openfree committed on
Commit 8962cc4 · verified · 1 Parent(s): d1bc4aa

Update app.py

Files changed (1)
  1. app.py +67 -66
app.py CHANGED
@@ -999,45 +999,31 @@ target_datasets = {
 
 def get_korea_datasets():
     """Search for Korea-related datasets"""
-    search_terms = ['korea', 'korean', 'kor']  # expanded list of search terms
-    all_korea_datasets = []
-
-    for term in search_terms:
-        params = {
-            "search": term,
-            "full": "True",
-            "limit": 10000  # expanded search scope
-        }
-
-        try:
-            response = requests.get(
-                "https://huggingface.co/api/datasets",
-                headers={'Authorization': f'Bearer {HF_TOKEN}'},
-                params=params
-            )
-
-            if response.status_code == 200:
-                datasets = response.json()
-                all_korea_datasets.extend(datasets)
-                print(f"Found {len(datasets)} datasets for search term '{term}'")
-            else:
-                print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
-        except Exception as e:
-            print(f"Error fetching datasets for term '{term}': {str(e)}")
-
-    # Remove duplicates
-    seen_ids = set()
-    unique_datasets = []
-    for dataset in all_korea_datasets:
-        dataset_id = dataset.get('id', '')
-        if dataset_id and dataset_id not in seen_ids:
-            seen_ids.add(dataset_id)
-            unique_datasets.append(dataset)
-
-    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
-    return unique_datasets
+    params = {
+        "search": "korea",
+        "full": "True",
+        "limit": 10000
+    }
+
+    try:
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            korea_datasets = response.json()
+            print(f"Found {len(korea_datasets)} Korea-related datasets")
+            return korea_datasets
+        else:
+            print(f"Failed to fetch Korea datasets: {response.status_code}")
+            return []
+    except Exception as e:
+        print(f"Error fetching Korea datasets: {str(e)}")
+        return []
 
-def get_all_datasets(limit=10000):  # increased default limit
+def get_all_datasets(limit=10000):
     """Fetch all datasets plus the Korea-related datasets"""
     all_datasets = []
     page_size = 1000
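Note: the rewritten get_korea_datasets() issues a single unauthenticated "korea" search against the public /api/datasets endpoint. For comparison, a minimal sketch of the same query through the official huggingface_hub client, which wraps the same endpoint (an alternative shown for illustration, not what app.py uses; the limit value mirrors the diff):

    from huggingface_hub import HfApi

    api = HfApi()
    # list_datasets hits https://huggingface.co/api/datasets under the hood
    korea_datasets = list(api.list_datasets(search="korea", limit=10000))
    print(len(korea_datasets))
    for ds in korea_datasets[:5]:
        print(ds.id)  # first few matching dataset ids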
@@ -1049,55 +1035,64 @@ def get_all_datasets(limit=10000):
             'offset': offset
         }
 
-        try:
-            response = requests.get(
-                "https://huggingface.co/api/datasets",
-                headers={'Authorization': f'Bearer {HF_TOKEN}'},
-                params=params
-            )
-
-            if response.status_code == 200:
-                datasets = response.json()
-                all_datasets.extend(datasets)
-                print(f"Fetched datasets {offset+1} to {offset+len(datasets)}")
-            else:
-                print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
-                break
-        except Exception as e:
-            print(f"Error fetching datasets at offset {offset}: {str(e)}")
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            all_datasets.extend(response.json())
+            print(f"Fetched datasets {offset+1} to {offset+len(response.json())}")
+        else:
+            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
             break
 
-    # Add the Korea search results
-    korea_datasets = get_korea_datasets()
-    existing_ids = {dataset.get('id', '') for dataset in all_datasets}
-
-    added_count = 0
-    for korea_dataset in korea_datasets:
-        if korea_dataset.get('id', '') not in existing_ids:
-            all_datasets.append(korea_dataset)
-            existing_ids.add(korea_dataset.get('id', ''))
-            added_count += 1
-
-    print(f"Added {added_count} additional Korea-related datasets")
-    print(f"Total datasets: {len(all_datasets)}")
+    # Expand the Korea search results the same way
+    korea_params = {
+        "search": "korea",
+        "full": "True",
+        "limit": limit
+    }
+
+    korea_response = requests.get(
+        "https://huggingface.co/api/datasets",
+        headers={'Accept': 'application/json'},  # use the Accept header instead of Authorization
+        params=korea_params
+    )
+
+    if korea_response.status_code == 200:
+        korea_datasets = korea_response.json()
+        print(f"Fetched {len(korea_datasets)} Korea-related datasets")
+
+        # Add the Korea datasets while removing duplicates
+        existing_ids = {dataset.get('id', '') for dataset in all_datasets}
+        for dataset in korea_datasets:
+            if dataset.get('id', '') not in existing_ids:
+                all_datasets.append(dataset)
+                existing_ids.add(dataset.get('id', ''))
+
+    print(f"Total unique datasets: {len(all_datasets)}")
 
     return all_datasets[:limit]
 
 def get_datasets_data(progress=gr.Progress()):
     def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
+        # Check the global rank
         global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
                             if d.get('id', '').strip() == dataset_id.strip()), None)
 
+        # Flag whether it is a Korea dataset
         is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
 
         if is_korea:
+            # Check the rank among the Korea datasets
            korea_rank = next((idx for idx, d in enumerate(korea_datasets, 1)
                               if d.get('id', '').strip() == dataset_id.strip()), None)
 
            if korea_rank:
-                return min(global_rank or 3001, korea_rank + 1000), True
+                return min(global_rank or 10001, korea_rank + 1000), True
 
-        return global_rank if global_rank else 'Not in top 3000', is_korea
+        return global_rank if global_rank else 'Not in top 10000', is_korea
 
     try:
         progress(0, desc="Fetching datasets...")
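The hunk above only shows the body of the paging request; the enclosing loop sits outside the diff context. A minimal sketch of the offset-based paging this code appears to rely on (the loop header, stop condition, and helper name are assumptions, not lines from app.py):

    import requests

    def fetch_dataset_pages(limit=10000, page_size=1000):
        """Hypothetical helper mirroring the paging pattern in the diff."""
        results = []
        for offset in range(0, limit, page_size):
            response = requests.get(
                "https://huggingface.co/api/datasets",
                headers={'Accept': 'application/json'},
                params={"limit": page_size, "offset": offset, "full": "True"},
            )
            if response.status_code != 200:
                break
            page = response.json()
            results.extend(page)
            if len(page) < page_size:  # a short page means no more results
                break
        return results[:limit]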
@@ -1113,9 +1108,15 @@ def get_datasets_data(progress=gr.Progress()):
         empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
         return fig, error_html, empty_df
 
-    all_global_datasets = get_all_datasets(limit=3000)
+    # Fetch both the general datasets and the Korea-related datasets
+    all_global_datasets = get_all_datasets(limit=10000)
     korea_datasets = get_korea_datasets()
 
+    print(f"Total global datasets fetched: {len(all_global_datasets)}")
+    print(f"Total Korea datasets fetched: {len(korea_datasets)}")
+
     filtered_datasets = []
     for dataset_id in target_datasets.keys():
         try:
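For reference, the rank blending in calculate_rank now tracks the enlarged 10,000-dataset window: a Korea dataset takes the better of its global rank and its Korea rank offset by 1000, and anything unranked globally falls back to 10001. A small worked example of that expression (blend_rank is a hypothetical name for illustration):

    def blend_rank(global_rank, korea_rank):
        # Mirrors: min(global_rank or 10001, korea_rank + 1000)
        return min(global_rank or 10001, korea_rank + 1000)

    print(blend_rank(12, 5))    # 12   -> a strong global rank wins
    print(blend_rank(None, 5))  # 1005 -> outside the global top 10000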