zhiminy committed on
Commit
dd5c0c7
·
1 Parent(s): 85a297a

upload_large_folder

Browse files
Files changed (2) hide show
  1. app.py +20 -7
  2. msr.py +20 -8
app.py CHANGED
@@ -128,9 +128,6 @@ def is_rate_limit_error(e):
128
  f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
129
  )
130
  )
131
- def upload_large_folder_with_backoff(api, **kwargs):
132
- """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
133
- return api.upload_large_folder(**kwargs)
134
 
135
 
136
  @backoff.on_exception(
@@ -181,6 +178,22 @@ def upload_file_with_backoff(api, **kwargs):
181
  return api.upload_file(**kwargs)
182
 
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  # =============================================================================
185
  # BIGQUERY FUNCTIONS
186
  # =============================================================================
@@ -880,14 +893,14 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
880
  save_jsonl(local_filename, merged_metadata)
881
  print(f" Prepared {len(merged_metadata)} reviews for {filename}")
882
 
883
- # Upload entire folder using upload_large_folder (optimized for large files)
884
- # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
885
  print(f"📤 Uploading {len(grouped)} files...")
886
- upload_large_folder_with_backoff(
887
  api=api,
888
  folder_path=temp_dir,
889
  repo_id=REVIEW_METADATA_REPO,
890
- repo_type="dataset"
 
891
  )
892
  print(f" ✓ Batch upload complete for {agent_identifier}")
893
 
 
128
  f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
129
  )
130
  )
 
 
 
131
 
132
 
133
  @backoff.on_exception(
 
178
  return api.upload_file(**kwargs)
179
 
180
 
181
+ @backoff.on_exception(
182
+ backoff.expo,
183
+ HfHubHTTPError,
184
+ max_tries=8,
185
+ base=300,
186
+ max_value=3600,
187
+ giveup=lambda e: not is_rate_limit_error(e),
188
+ on_backoff=lambda details: print(
189
+ f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
190
+ )
191
+ )
192
+ def upload_folder_with_backoff(api, **kwargs):
193
+ """Wrapper for api.upload_folder() with exponential backoff for rate limits."""
194
+ return api.upload_folder(**kwargs)
195
+
196
+
197
  # =============================================================================
198
  # BIGQUERY FUNCTIONS
199
  # =============================================================================
 
893
  save_jsonl(local_filename, merged_metadata)
894
  print(f" Prepared {len(merged_metadata)} reviews for {filename}")
895
 
896
+ # Upload entire folder using upload_folder (single commit per agent)
 
897
  print(f"📤 Uploading {len(grouped)} files...")
898
+ upload_folder_with_backoff(
899
  api=api,
900
  folder_path=temp_dir,
901
  repo_id=REVIEW_METADATA_REPO,
902
+ repo_type="dataset",
903
+ commit_message=f"Update review metadata for {agent_identifier}"
904
  )
905
  print(f" ✓ Batch upload complete for {agent_identifier}")
906
 
msr.py CHANGED
@@ -119,10 +119,6 @@ def is_rate_limit_error(e):
119
  f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
120
  )
121
  )
122
- def upload_large_folder_with_backoff(api, **kwargs):
123
- """Wrapper for api.upload_large_folder() with exponential backoff for rate limits."""
124
- return api.upload_large_folder(**kwargs)
125
-
126
 
127
  @backoff.on_exception(
128
  backoff.expo,
@@ -172,6 +168,22 @@ def upload_file_with_backoff(api, **kwargs):
172
  return api.upload_file(**kwargs)
173
 
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  def get_bigquery_client():
176
  """
177
  Initialize BigQuery client using credentials from environment variable.
@@ -597,14 +609,14 @@ def save_review_metadata_to_hf(metadata_list, agent_identifier):
597
  save_jsonl(local_filename, day_metadata)
598
  print(f" Prepared {len(day_metadata)} reviews for {filename}")
599
 
600
- # Upload entire folder using upload_large_folder (optimized for large files)
601
- # Note: upload_large_folder creates multiple commits automatically and doesn't support custom commit_message
602
  print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total reviews)...")
603
- upload_large_folder_with_backoff(
604
  api=api,
605
  folder_path=temp_dir,
606
  repo_id=REVIEW_METADATA_REPO,
607
- repo_type="dataset"
 
608
  )
609
  print(f" ✓ Batch upload complete for {agent_identifier}")
610
 
 
119
  f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
120
  )
121
  )
 
 
 
 
122
 
123
  @backoff.on_exception(
124
  backoff.expo,
 
168
  return api.upload_file(**kwargs)
169
 
170
 
171
+ @backoff.on_exception(
172
+ backoff.expo,
173
+ HfHubHTTPError,
174
+ max_tries=8,
175
+ base=300,
176
+ max_value=3600,
177
+ giveup=lambda e: not is_rate_limit_error(e),
178
+ on_backoff=lambda details: print(
179
+ f"⏳ Rate limited. Retrying in {details['wait']/60:.1f} minutes ({details['wait']:.0f}s) - attempt {details['tries']}/8..."
180
+ )
181
+ )
182
+ def upload_folder_with_backoff(api, **kwargs):
183
+ """Wrapper for api.upload_folder() with exponential backoff for rate limits."""
184
+ return api.upload_folder(**kwargs)
185
+
186
+
187
  def get_bigquery_client():
188
  """
189
  Initialize BigQuery client using credentials from environment variable.
 
609
  save_jsonl(local_filename, day_metadata)
610
  print(f" Prepared {len(day_metadata)} reviews for {filename}")
611
 
612
+ # Upload entire folder using upload_folder (single commit per agent)
 
613
  print(f" 📤 Uploading {len(grouped)} files ({len(metadata_list)} total reviews)...")
614
+ upload_folder_with_backoff(
615
  api=api,
616
  folder_path=temp_dir,
617
  repo_id=REVIEW_METADATA_REPO,
618
+ repo_type="dataset",
619
+ commit_message=f"Update review metadata for {agent_identifier}"
620
  )
621
  print(f" ✓ Batch upload complete for {agent_identifier}")
622