awacke1 committed on
Commit
da59af8
Β·
verified Β·
1 Parent(s): e174780

Create app.py.v2

Browse files
Files changed (1) hide show
  1. app.py.v2 +276 -0
app.py.v2 ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import json
import tempfile
import traceback
import warnings

import dask.dataframe as dd
import gradio as gr
import pandas as pd
import polars as pl
import requests
from datasets import load_dataset, Image
from huggingface_hub import get_token
from mlcroissant import Dataset as CroissantDataset
13
+
14
+ # 🀫 Let's ignore those pesky warnings, shall we?
15
+ warnings.filterwarnings("ignore")
16
+
17
+ # --- βš™οΈ Configuration & Constants ---
18
# Registry of queryable datasets.
# Each entry: `name` (HF repo id), `emoji` (tab label), `methods` (access-method
# radio choices matched by substring in fetch_data), and `is_public`
# (False => gated repo, requests need an Authorization header).
DATASET_CONFIG = {
    "caselaw": {
        "name": "common-pile/caselaw_access_project", "emoji": "βš–οΈ",
        # Fixed mojibake: the Dask label previously contained a broken "οΏ½" glyph.
        "methods": ["πŸ’¨ API (requests)", "πŸ“¦ Dask", "πŸ₯ Croissant"], "is_public": True,
    },
    "prompts": {
        "name": "fka/awesome-chatgpt-prompts", "emoji": "πŸ€–",
        "methods": ["🐼 Pandas", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": True,
    },
    "finance": {
        "name": "snorkelai/agent-finance-reasoning", "emoji": "πŸ’°",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "medical": {
        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
        "methods": ["🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
    "inscene": {
        "name": "peteromallet/InScene-Dataset", "emoji": "πŸ–ΌοΈ",
        "methods": ["πŸ€— Datasets", "🐼 Pandas", "🧊 Polars", "πŸ’¨ API (requests)", "πŸ₯ Croissant"], "is_public": False,
    },
}
40
+
41
+ # --- πŸ› οΈ Helpers & Utility Functions ---
42
+
43
def get_auth_headers():
    """Return HTTP headers carrying the cached Hugging Face token, or {} if none.

    Relies on `huggingface_hub.get_token()` to read the locally stored token
    (e.g. from `huggingface-cli login`).
    """
    hf_token = get_token()
    if hf_token:
        return {"Authorization": f"Bearer {hf_token}"}
    return {}
46
+
47
def dataframe_to_outputs(df: pd.DataFrame):
    """Convert a results DataFrame into the four export views used by the UI.

    Args:
        df: The search results (may be empty).

    Returns:
        A 4-tuple of (markdown table, CSV file path or None,
        XLSX file path or None, tab-delimited text for copy-paste).

    Fix: the original called `gr.File.from_bytes(...)`, which is not part of
    Gradio's public API and raises AttributeError. `gr.File` accepts a file
    path as its value, so exports are written to temp files instead.
    """
    if df.empty:
        return "No results found. 🀷", None, None, "No results to copy."
    # Stringify first so to_markdown never chokes on mixed/object cells.
    df_str = df.astype(str)
    markdown_output = df_str.to_markdown(index=False)
    # Persist CSV/XLSX to temp files; gr.File serves them as downloads.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as csv_file:
        df.to_csv(csv_file, index=False)
        csv_path = csv_file.name
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file:
        xlsx_path = xlsx_file.name
    df.to_excel(xlsx_path, index=False, engine='openpyxl')
    tab_delimited_output = df.to_csv(sep='\t', index=False)
    return (
        markdown_output,
        csv_path,
        xlsx_path,
        tab_delimited_output,
    )
65
+
66
+ # --- ✨ NEW Enhanced Error Handler with Debug Logging ---
67
def handle_error(e: Exception, request=None, response=None):
    """
    😱 Oh no! An error! Build the 9-tuple of UI outputs describing a failure,
    including a detailed debug log shown in the (normally hidden) gr.Code box.

    Args:
        e: The exception that was raised.
        request: Optional prepared HTTP request of the failing call.
        response: Optional HTTP response of the failing call.

    Returns:
        A 9-tuple matching fetch_data's output components:
        (df, gallery, status, markdown, csv, xlsx, copy, code, debug_log).
    """
    error_message = f"🚨 An error occurred: {str(e)}\n"
    auth_tip = "πŸ”‘ For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
    full_trace = traceback.format_exc()
    print(full_trace)
    if "401" in str(e) or "Gated" in str(e):
        error_message += auth_tip

    debug_log = f"""
--- 🐞 DEBUG LOG ---
Traceback:
{full_trace}

Exception Type: {type(e).__name__}
Exception Details: {e}
"""
    if request:
        # SECURITY: the request headers contain the HF bearer token; redact it
        # before rendering the log in the UI so credentials never leak.
        safe_headers = dict(request.headers)
        for key in safe_headers:
            if key.lower() == "authorization":
                safe_headers[key] = "Bearer ***REDACTED***"
        debug_log += f"""
--- REQUEST ---
Method: {request.method}
URL: {request.url}
Headers: {json.dumps(safe_headers, indent=2)}
"""
    if response is not None:
        try:
            response_json = response.json()
            response_text = json.dumps(response_json, indent=2)
        except ValueError:
            # ValueError covers json.JSONDecodeError AND requests' own
            # JSONDecodeError subclass, so non-JSON bodies never re-raise here.
            response_text = response.text
        debug_log += f"""
--- RESPONSE ---
Status Code: {response.status_code}
Headers: {json.dumps(dict(response.headers), indent=2)}
Content:
{response_text}
"""

    # Return a tuple of 9 to match the outputs wired up in create_dataset_tab.
    return (
        pd.DataFrame(), gr.Gallery(None), f"### 🚨 Error\nAn error occurred. See the debug log below for details.",
        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
        gr.Code(value=debug_log, visible=True)  # Make the debug log visible
    )
115
+
116
def search_dataframe(df: pd.DataFrame, query: str):
    """Case-insensitively search every string column of *df* for *query*.

    Args:
        df: DataFrame to filter.
        query: Literal substring to look for; empty/None returns a 100-row sample.

    Returns:
        Matching rows, a 100-row head when query is empty, or an empty
        DataFrame when *df* has no string-typed columns.
    """
    if not query:
        return df.head(100)
    string_cols = df.select_dtypes(include=['object', 'string']).columns
    if string_cols.empty:
        return pd.DataFrame()
    # Build the mask on df's own index: a positional pd.Series([False]*len(df))
    # raises an unalignable-mask error for any non-default index.
    mask = pd.Series(False, index=df.index)
    for col in string_cols:
        # regex=False: user queries are literal text, so metacharacters like
        # "(" or "[" must not blow up the search.
        mask |= df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
    return df[mask]
126
+
127
+ # --- 🎣 Data Fetching & Processing Functions ---
128
def fetch_data(dataset_key: str, access_method: str, query: str):
    """
    πŸš€ Main mission control. A generator that always yields a tuple of 9 values
    to match the UI components wired in create_dataset_tab:
    (df, gallery, status, markdown, csv, xlsx, copy, code, debug_log).

    Args:
        dataset_key: Key into DATASET_CONFIG.
        access_method: Radio label; matched by substring ("API", "Pandas", "Datasets").
        query: Search text; empty means "fetch a sample".
    """
    # Initialize the state for all 9 output components.
    outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]

    req, res = None, None  # Kept around so handle_error can log the HTTP exchange.
    try:
        config = DATASET_CONFIG[dataset_key]
        repo_id = config["name"]

        if "API" in access_method:
            all_results_df = pd.DataFrame()
            MAX_PAGES = 5
            PAGE_SIZE = 100

            if not query:
                MAX_PAGES = 1  # No search term: just show one sample page.
                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
                yield tuple(outputs)

            for page in range(MAX_PAGES):
                if query:
                    outputs[2] = f"⏳ Searching page {page + 1}..."
                    yield tuple(outputs)

                offset = page * PAGE_SIZE
                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
                headers = get_auth_headers() if not config["is_public"] else {}

                # timeout added so a stalled datasets-server call cannot hang the UI.
                res = requests.get(url, headers=headers, timeout=30)
                req = res.request
                res.raise_for_status()  # Raises for 4xx/5xx -> handled below.

                data = res.json()

                if not data.get('rows'):
                    outputs[2] = "🏁 No more data to search."
                    yield tuple(outputs)
                    break

                page_df = pd.json_normalize(data['rows'], record_path='row')
                found_in_page = search_dataframe(page_df, query)

                if not found_in_page.empty:
                    all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
                    outputs[0] = all_results_df
                    outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                    outputs[2] = f"βœ… Found **{len(all_results_df)}** results so far..."

                    if dataset_key == 'inscene':
                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
                        outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)
                # Stream progress after every page so the UI stays live.
                yield tuple(outputs)

            outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
            yield tuple(outputs)
            return

        outputs[2] = f"⏳ Loading data via `{access_method}`..."
        yield tuple(outputs)

        df = pd.DataFrame()
        # Simplified for brevity - expand if needed.
        if "Pandas" in access_method:
            file_path = f"hf://datasets/{repo_id}/"
            if repo_id == "fka/awesome-chatgpt-prompts":
                file_path += "prompts.csv"
                df = pd.read_csv(file_path)
            else:
                # Repo layouts differ; probe the common parquet paths, then the
                # known JSON fallback. Narrowed from bare `except:` so Ctrl-C /
                # SystemExit are never swallowed.
                try:
                    df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
                except Exception:
                    try:
                        df = pd.read_parquet(f"{file_path}train.parquet")
                    except Exception:
                        df = pd.read_json(f"{file_path}medical_o1_sft.json")
        elif "Datasets" in access_method:
            # Streaming + take(1000) avoids downloading the full dataset.
            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
            df = pd.DataFrame(ds)

        outputs[2] = "πŸ” Searching loaded data..."
        yield tuple(outputs)

        final_df = search_dataframe(df, query)

        outputs[0] = final_df
        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
        outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."

        if dataset_key == 'inscene' and not final_df.empty:
            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
            outputs[1] = gr.Gallery(gallery_data, label="πŸ–ΌοΈ Image Results", height=400)

        yield tuple(outputs)

    except Exception as e:
        yield handle_error(e, req, res)
222
+
223
+
224
+ # --- πŸ–ΌοΈ UI Generation ---
225
def create_dataset_tab(dataset_key: str):
    """Build one gr.Tab of UI for the dataset at *dataset_key* in DATASET_CONFIG.

    NOTE: Gradio Blocks layout is defined by the order components are
    instantiated inside the context managers, so statement order here is
    load-bearing — do not reorder.
    """
    config = DATASET_CONFIG[dataset_key]

    with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
        gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
        if not config['is_public']:
            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")

        with gr.Row():
            # Radio choices come straight from the config; fetch_data matches
            # them by substring ("API", "Pandas", "Datasets").
            access_method = gr.Radio(config['methods'], label="πŸ”‘ Access Method", value=config['methods'][0])
            query = gr.Textbox(label="πŸ” Search Query", placeholder="Enter any text to search, or leave blank for samples...")

        fetch_button = gr.Button("πŸš€ Go Fetch!")
        status_output = gr.Markdown("🏁 Ready to search.")
        df_output = gr.DataFrame(label="πŸ“Š Results DataFrame", interactive=False, wrap=True)
        # Gallery only shown for the image dataset.
        gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="πŸ–ΌοΈ Image Results")

        with gr.Accordion("πŸ“‚ View/Export Full Results", open=False):
            markdown_output = gr.Markdown(label="πŸ“ Markdown View")
            with gr.Row():
                csv_output = gr.File(label="⬇️ Download CSV")
                xlsx_output = gr.File(label="⬇️ Download XLSX")
            copy_output = gr.Code(label="πŸ“‹ Copy-Paste (Tab-Delimited)")

        code_output = gr.Code(label="πŸ’» Python Code Snippet", language="python")

        # Debug log stays hidden until handle_error makes it visible
        # (language parameter deliberately omitted).
        debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)

        # Outputs must stay a 9-element list in this exact order — fetch_data
        # and handle_error both yield/return 9-tuples matching it.
        fetch_button.click(
            fn=fetch_data,
            inputs=[gr.State(dataset_key), access_method, query],
            outputs=[
                df_output, gallery_output, status_output, markdown_output,
                csv_output, xlsx_output, copy_output, code_output,
                debug_log_output
            ]
        )
263
+
264
+ # --- πŸš€ Main App ---
265
# Top-level app assembly: one tab per configured dataset. Runs at import time
# (Gradio Blocks pattern); only the launch is guarded by __main__.
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
    gr.Markdown("# πŸ€— Hugging Face Dataset Explorer")
    gr.Markdown(
        "Select a dataset, choose an access method, and type a query. "
        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
    )
    with gr.Tabs():
        for key in DATASET_CONFIG.keys():
            create_dataset_tab(key)

if __name__ == "__main__":
    # debug=True surfaces server-side tracebacks in the console.
    demo.launch(debug=True)