"""Dataset Insight Portal.

A Gradio Space for browsing, searching, and annotating the dataset-card
inventory stored in ``datasetcards_new.parquet``.  Members of the
``hugging-science`` organization can assign datasets to themselves and
track review progress; edits are persisted to the Parquet file and a
background thread commits and pushes the file back to the Space repo.

NOTE(review): three fully commented-out prior iterations of this app were
removed from this file; recover them from git history if ever needed.
"""

import os
import subprocess
import threading
import time

import gradio as gr
import polars as pl
from huggingface_hub import HfApi

# --- Config ---
org_name = "hugging-science"
PARQUET_PATH = "datasetcards_new.parquet"
ROWS_PER_PAGE = 50
STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]
# Columns searched by exact match from a dropdown; every other column is
# free-text, case-insensitive substring search.
DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]

api = HfApi()


# --- Helpers ---
def fetch_members():
    """Return the usernames of all members of the organization."""
    return [m.username for m in api.list_organization_members(org_name)]


member_list = fetch_members()

# --- Load and normalize the dataset ---
df = pl.read_parquet(PARQUET_PATH)

# Make sure the two editable columns exist and contain no nulls.
for col, default in [("assigned_to", ""), ("status", "todo")]:
    if col not in df.columns:
        df = df.with_columns(pl.lit(default).alias(col))
    else:
        df = df.with_columns(pl.col(col).fill_null(default))

df = df.sort(
    by=["downloads", "last_modified", "usedStorage"],
    descending=[True, True, True],
)

# Collapse every "... short description ..." variant of `reason` to the
# canonical label, and replace nulls with "" (same semantics as the
# original Python list comprehension, expressed in polars).
if "reason" in df.columns:
    df = df.with_columns(
        pl.when(
            pl.col("reason")
            .str.to_lowercase()
            .str.contains("short description", literal=True)
        )
        .then(pl.lit("short description"))
        .otherwise(pl.col("reason").fill_null(""))
        .alias("reason")
    )

# Dropdown choices per searchable column.  Guard against columns missing
# from the Parquet file (the original comprehension raised KeyError at
# startup when a DROPDOWN_COLUMNS entry was absent).
unique_values = {
    col: sorted(df[col].drop_nulls().unique().to_list())
    for col in DROPDOWN_COLUMNS
    if col in df.columns
}
unique_values["assigned_to"] = sorted(member_list)
unique_values["status"] = STATUS_OPTIONS


# --- Pagination / filtering ---
def get_page(df, page, column=None, query=None):
    """Return ``(page_df, total_pages)`` for one page of the filtered table.

    Args:
        df: polars DataFrame to page through.
        page: zero-based page index.  ``gr.Number`` can deliver a float,
            so the value is coerced to a non-negative int here.
        column: optional column name to filter on.
        query: exact value (dropdown columns) or case-insensitive
            substring (all other columns).

    Returns:
        A pandas DataFrame of at most ROWS_PER_PAGE rows (NaN/None shown
        as ""), and the total page count (always >= 1).
    """
    page = max(0, int(page))
    filtered = df
    if column and query:
        if column in DROPDOWN_COLUMNS:
            filtered = filtered.filter(pl.col(column) == query)
        else:
            # Cast to Utf8 first so numeric columns are searchable too;
            # the previous code raised on non-string dtypes.  Also filter
            # from `filtered`, not `df`, so the chain stays composable.
            needle = query.strip().lower()
            filtered = filtered.filter(
                pl.col(column)
                .cast(pl.Utf8)
                .str.to_lowercase()
                .str.contains(needle, literal=True)
            )
    start = page * ROWS_PER_PAGE
    page_df = filtered[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
    total_rows = filtered.height
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
    return page_df, total_pages


initial_df, total_pages = get_page(df, 0)
columns = list(initial_df.columns)

# --- Gradio App ---
with gr.Blocks() as demo:
    gr.Markdown("""
# Dataset Insight Portal
Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.
## What is this space for?
This space provides a table of datasets along with metadata. You can:
- Browse datasets with pagination.
- Search datasets by various fields.
- Assign responsibility for reviewing datasets (`assigned_to`).
- Track progress using `status`.
- update the parquet file and push to git automatically every 20mins. So if you see restarting/building pls wait for 5mins.
## Why the table?
The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.
## What does the table contain?
Each row represents a dataset. Columns include:
- **dataset_id**: Unique identifier of the dataset.
- **dataset_url**: Link to the dataset page on Hugging Face.
- **downloads**: Number of downloads.
- **author**: Dataset author.
- **license**: License type.
- **tags**: Tags describing the dataset. Obtained from the dataset card.
- **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
- **last_modified**: Date of last update.
- **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
- **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
- **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
- **usedStorage**: Storage used by the dataset (bytes).
- **assigned_to**: Person responsible for the dataset (editable).
- **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.
## How to use search
- Select a **column** from the dropdown.
- If the column is textual, type your query in the text box.
- If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
- Click **Search** to filter the table.
## How to add or update `assigned_to` and `status`
1. Search for the **dataset_id** initially.
2. Then, select the **dataset_id** from the dropdown below the table.
3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
4. Select the current status in **Status**.
5. Click **Save Changes** to update the table and persist the changes.
6. Use **Refresh All** to reload the table and the latest members list.
This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
""")

    # Pagination controls
    with gr.Row():
        prev_btn = gr.Button("Previous")
        next_btn = gr.Button("Next")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df,
        headers=columns,
        datatype="str",
        interactive=False,
        row_count=ROWS_PER_PAGE,
    )

    # Search controls
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
        search_text = gr.Textbox(label="Search Text")
        search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
        search_btn = gr.Button("Search")
        reset_btn = gr.Button("Reset")

    # Dataset edit widgets
    selected_dataset_id = gr.Dropdown(
        label="Select dataset_id", choices=initial_df['dataset_id'].tolist()
    )
    assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")
    status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")
    save_btn = gr.Button("Save Changes")
    refresh_btn = gr.Button("Refresh All")
    save_message = gr.Textbox(label="Save Status", interactive=False)

    # --- Callbacks ---
    def update_search_input(col):
        """Toggle between the value dropdown and the free-text box."""
        if col in DROPDOWN_COLUMNS:
            # .get() so a dropdown column absent from the file degrades to
            # an empty choice list instead of raising.
            return (
                gr.update(choices=unique_values.get(col, []), visible=True),
                gr.update(visible=False),
            )
        return gr.update(visible=False), gr.update(visible=True)

    col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])

    def prefill_fields(dataset_id):
        """Populate the edit widgets from the selected row (defaults otherwise)."""
        if not dataset_id:
            return "", "todo"
        row = df.filter(pl.col("dataset_id") == dataset_id)
        if row.height == 0:
            return "", "todo"
        return row[0, "assigned_to"], row[0, "status"]

    selected_dataset_id.change(
        prefill_fields, selected_dataset_id, [assigned_to_input, status_input]
    )

    def _query_for(col, txt, ddl):
        """Pick the active query value for the selected column type."""
        return ddl if col in DROPDOWN_COLUMNS else txt

    # --- Search & pagination ---
    def search_func(page, col, txt, ddl):
        page_df, total_pages = get_page(df, page, col, _query_for(col, txt, ddl))
        return (
            page_df,
            f"Total Pages: {total_pages}",
            0,
            gr.update(choices=page_df['dataset_id'].tolist()),
        )

    def next_page(page, col, txt, ddl):
        page = int(page) + 1
        query = _query_for(col, txt, ddl)
        page_df, total_pages = get_page(df, page, col, query)
        if page >= total_pages:
            # Clamp to the last page and re-fetch it.
            page = total_pages - 1
            page_df, _ = get_page(df, page, col, query)
        return (
            page_df,
            f"Total Pages: {total_pages}",
            page,
            gr.update(choices=page_df['dataset_id'].tolist()),
        )

    def prev_page(page, col, txt, ddl):
        page = max(0, int(page) - 1)
        page_df, total_pages = get_page(df, page, col, _query_for(col, txt, ddl))
        return (
            page_df,
            f"Total Pages: {total_pages}",
            page,
            gr.update(choices=page_df['dataset_id'].tolist()),
        )

    def reset_func():
        page_df, total_pages = get_page(df, 0)
        return (
            page_df,
            f"Total Pages: {total_pages}",
            0,
            gr.update(choices=page_df['dataset_id'].tolist()),
        )

    # --- Save & refresh ---
    def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl):
        """Persist assigned_to/status for one dataset and redraw the page.

        NOTE(review): mutates the module-level `df` without a lock; two
        concurrent saves can race — confirm whether the Space ever runs
        with more than one worker.
        """
        global df
        if not dataset_id:
            # No-op updates instead of bare None, which would blank the
            # table and dropdown.
            return (
                gr.update(value="Select a dataset first."),
                gr.update(),
                gr.update(),
                gr.update(),
            )
        df = df.with_columns([
            pl.when(pl.col("dataset_id") == dataset_id)
            .then(pl.lit(assigned_to_val))
            .otherwise(pl.col("assigned_to"))
            .alias("assigned_to"),
            pl.when(pl.col("dataset_id") == dataset_id)
            .then(pl.lit(status_val))
            .otherwise(pl.col("status"))
            .alias("status"),
        ])
        df.write_parquet(PARQUET_PATH)
        page_df, total_pages = get_page(df, page_val, col, _query_for(col, txt, ddl))
        return (
            gr.update(value=f"Saved dataset_id: {dataset_id}"),
            page_df,
            gr.update(choices=page_df['dataset_id'].tolist()),
            f"Total Pages: {total_pages}",
        )

    def refresh_all(page, col, txt, ddl):
        """Reload the org member list and redraw the current page."""
        global member_list, unique_values
        member_list = fetch_members()
        unique_values["assigned_to"] = sorted(member_list)
        page_df, total_pages = get_page(df, page, col, _query_for(col, txt, ddl))
        return (
            page_df,
            f"Total Pages: {total_pages}",
            page,
            gr.update(choices=page_df['dataset_id'].tolist()),
            gr.update(choices=member_list),
        )

    # --- Wire buttons ---
    inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
    outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]
    search_btn.click(search_func, inputs_search, outputs_search)
    next_btn.click(next_page, inputs_search, outputs_search)
    prev_btn.click(prev_page, inputs_search, outputs_search)
    reset_btn.click(reset_func, [], outputs_search)
    save_btn.click(
        save_changes,
        [selected_dataset_id, assigned_to_input, status_input,
         page_number, col_dropdown, search_text, search_dropdown],
        [save_message, data_table, selected_dataset_id, total_pages_display],
    )
    refresh_btn.click(
        refresh_all,
        [page_number, col_dropdown, search_text, search_dropdown],
        [data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input],
    )


# --- Background auto-push of the Parquet file ---
def auto_push_loop(interval=1200):
    """Every ``interval`` seconds, commit and push PARQUET_PATH if it changed.

    Idles (sleeps) when HF_TOKEN is not set.
    NOTE(review): `git remote set-url` embeds the token in .git/config on
    disk — prefer a git credential helper or huggingface_hub's upload API.
    """
    while True:
        try:
            hf_token = os.environ.get("HF_TOKEN")
            if not hf_token:
                time.sleep(interval)
                continue
            repo_url = (
                f"https://user:{hf_token}@huggingface.co/spaces/"
                f"{org_name}/dataset-insight-portal"
            )
            subprocess.run(
                ["git", "config", "--global", "user.email", "santosh9sanjeev@gmail.com"],
                check=True,
            )
            subprocess.run(
                ["git", "config", "--global", "user.name", "Santosh Sanjeev"],
                check=True,
            )
            subprocess.run(["git", "remote", "set-url", "origin", repo_url], check=True)
            subprocess.run(["git", "add", PARQUET_PATH], check=True)
            # `git diff --cached --quiet` exits non-zero only when something
            # is staged, i.e. the Parquet file actually changed.
            result = subprocess.run(["git", "diff", "--cached", "--quiet"])
            if result.returncode != 0:
                subprocess.run(
                    ["git", "commit", "-m", "Auto-update parquet file"], check=True
                )
                subprocess.run(["git", "push", "origin", "main"], check=True)
                print("✅ Parquet pushed")
        except Exception as e:
            # Best-effort background job: log and retry next cycle.
            print("⚠️ Push failed:", e)
        time.sleep(interval)


threading.Thread(target=auto_push_loop, args=(1200,), daemon=True).start()

demo.launch()