Update app.py
app.py CHANGED
@@ -999,45 +999,31 @@ target_datasets = {
 
 def get_korea_datasets():
     """Search for Korea-related datasets"""
-            print(f"Found {len(datasets)} datasets for search term '{term}'")
-        else:
-            print(f"Failed to fetch datasets for term '{term}': {response.status_code}")
-    except Exception as e:
-        print(f"Error fetching datasets for term '{term}': {str(e)}")
-
-    # Remove duplicates
-    seen_ids = set()
-    unique_datasets = []
-    for dataset in all_korea_datasets:
-        dataset_id = dataset.get('id', '')
-        if dataset_id and dataset_id not in seen_ids:
-            seen_ids.add(dataset_id)
-            unique_datasets.append(dataset)
-
-    print(f"Total unique Korea-related datasets found: {len(unique_datasets)}")
-    return unique_datasets
+    params = {
+        "search": "korea",
+        "full": "True",
+        "limit": 10000
+    }
+
+    try:
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # Use an Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            korea_datasets = response.json()
+            print(f"Found {len(korea_datasets)} Korea-related datasets")
+            return korea_datasets
+        else:
+            print(f"Failed to fetch Korea datasets: {response.status_code}")
+            return []
+    except Exception as e:
+        print(f"Error fetching Korea datasets: {str(e)}")
+        return []
 
-def get_all_datasets(limit=10000):
+def get_all_datasets(limit=10000):  # default limit increased
     """Fetch all datasets and the Korea-related datasets"""
     all_datasets = []
     page_size = 1000
@@ -1049,55 +1035,64 @@ def get_all_datasets(limit=10000):  # default limit increased
             'offset': offset
         }
 
-        else:
-            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
-            break
-        except Exception as e:
-            print(f"Error fetching datasets at offset {offset}: {str(e)}")
+        response = requests.get(
+            "https://huggingface.co/api/datasets",
+            headers={'Accept': 'application/json'},  # Use an Accept header instead of Authorization
+            params=params
+        )
+
+        if response.status_code == 200:
+            all_datasets.extend(response.json())
+            print(f"Fetched datasets {offset+1} to {offset+len(response.json())}")
+        else:
+            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
             break
 
-    # Korea search
-            added_count += 1
+    # Expand the Korea search results the same way
+    korea_params = {
+        "search": "korea",
+        "full": "True",
+        "limit": limit
+    }
+
+    korea_response = requests.get(
+        "https://huggingface.co/api/datasets",
+        headers={'Accept': 'application/json'},  # Use an Accept header instead of Authorization
+        params=korea_params
+    )
+
+    if korea_response.status_code == 200:
+        korea_datasets = korea_response.json()
+        print(f"Fetched {len(korea_datasets)} Korea-related datasets")
+
+        # Add the Korea datasets while removing duplicates
+        existing_ids = {dataset.get('id', '') for dataset in all_datasets}
+        for dataset in korea_datasets:
+            if dataset.get('id', '') not in existing_ids:
+                all_datasets.append(dataset)
+                existing_ids.add(dataset.get('id', ''))
+
+    print(f"Total unique datasets: {len(all_datasets)}")
     return all_datasets[:limit]
 
 def get_datasets_data(progress=gr.Progress()):
     def calculate_rank(dataset_id, all_global_datasets, korea_datasets):
+        # Check the global rank
         global_rank = next((idx for idx, d in enumerate(all_global_datasets, 1)
                             if d.get('id', '').strip() == dataset_id.strip()), None)
 
+        # Check whether this is a Korea dataset
         is_korea = any(d.get('id', '').strip() == dataset_id.strip() for d in korea_datasets)
 
         if is_korea:
+            # Check the rank among the Korea datasets
             korea_rank = next((idx for idx, d in enumerate(korea_datasets, 1)
                                if d.get('id', '').strip() == dataset_id.strip()), None)
 
             if korea_rank:
-                return min(global_rank or
+                return min(global_rank or 10001, korea_rank + 1000), True
 
-        return global_rank if global_rank else 'Not in top
+        return global_rank if global_rank else 'Not in top 10000', is_korea
 
     try:
         progress(0, desc="Fetching datasets...")
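
The two changed return lines in calculate_rank() carry the substantive fix: a dataset found by the Korea search keeps its global rank when it has one, and otherwise falls back to its Korea-search rank offset by 1000, with 10001 standing in for "outside the global top 10000". A standalone illustration of that rule, using a hypothetical helper and made-up ranks:

def adjusted_rank(global_rank, korea_rank):
    # Mirrors the new return expression: a missing global rank is treated as
    # 10001, i.e. worse than any rank inside the global top 10000.
    return min(global_rank or 10001, korea_rank + 1000)

print(adjusted_rank(global_rank=37, korea_rank=2))    # 37: the global rank wins
print(adjusted_rank(global_rank=None, korea_rank=2))  # 1002: Korea rank + 1000
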
@@ -1113,9 +1108,15 @@ def get_datasets_data(progress=gr.Progress()):
         empty_df = pd.DataFrame(columns=['Global Rank', 'Dataset ID', 'Title', 'Downloads', 'Likes', 'Korea Search', 'URL'])
         return fig, error_html, empty_df
 
+        # Fetch both the general datasets and the Korea-related datasets
+        all_global_datasets = get_all_datasets(limit=10000)
         korea_datasets = get_korea_datasets()
 
+        print(f"Total global datasets fetched: {len(all_global_datasets)}")
+        print(f"Total Korea datasets fetched: {len(korea_datasets)}")
+
         filtered_datasets = []
         for dataset_id in target_datasets.keys():
             try:
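
The hunks never show the top of the paging loop in get_all_datasets(), only its 'offset': offset parameter and the break on failure. A hypothetical, self-contained sketch of the offset-based paging pattern those context lines imply; the loop shape and the short-page early exit are assumptions, not part of the commit:

import requests

def fetch_datasets_paged(limit=10000, page_size=1000):
    """Hypothetical reconstruction of the paging pattern in get_all_datasets()."""
    all_datasets = []
    for offset in range(0, limit, page_size):
        response = requests.get(
            "https://huggingface.co/api/datasets",
            headers={"Accept": "application/json"},
            params={"full": "True", "limit": page_size, "offset": offset},
        )
        if response.status_code != 200:
            print(f"Failed to fetch datasets at offset {offset}: {response.status_code}")
            break
        page = response.json()
        all_datasets.extend(page)
        if len(page) < page_size:  # short page: assume no more results
            break
    return all_datasets[:limit]
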