import os
os.environ["MPLCONFIGDIR"] = "/tmp/matplotlib_cache"

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

st.title("📊 Parquet Data Explorer")

from datasets import load_dataset
from datasets import Dataset

# Hugging Face Hub repository that hosts the trip data.
# Log in first (e.g. `huggingface-cli login`) if the dataset requires access.
DATASET_REPO = "Daniela-C/Yellow_tripdata_2025"


@st.cache_data
def load_data(repo_id: str = DATASET_REPO, split: str = "train") -> pd.DataFrame:
    """Load one split of a Hugging Face dataset as a pandas DataFrame.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached to avoid reloading and reconverting the dataset on
    each rerun.

    Args:
        repo_id: Dataset repository id on the Hugging Face Hub.
        split: Name of the split to load. Only "train" is loaded by default;
            other splits (e.g. "validation", "test") must exist in the
            repository, otherwise ``load_dataset`` raises.

    Returns:
        The requested split converted to a DataFrame.
    """
    return load_dataset(repo_id, split=split).to_pandas()


df = load_data()

# Show the first rows so the user can see what was loaded.
st.write("Sample of Your Data:")
st.dataframe(df.head())

# Offer a histogram for any numeric column; warn when there is none.
numeric_cols = df.select_dtypes(include='number').columns
if numeric_cols.empty:
    st.warning("No numeric columns found in your dataset.")
else:
    selected_col = st.selectbox("Choose a numeric column to plot:", numeric_cols)
    st.write(f"### Histogram for `{selected_col}`")
    fig, ax = plt.subplots()
    try:
        df[selected_col].hist(bins=30, ax=ax)
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates in a global registry; since
        # the script reruns on each widget change, close the figure once it
        # has been rendered so figures do not accumulate in memory.
        plt.close(fig)