File size: 2,432 Bytes
fe70f39
 
ac6d75c
fe70f39
 
 
 
 
3376857
 
 
 
fe70f39
 
 
 
 
 
3376857
fe70f39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Streamlit App: NYC Taxi Anomaly Detector with Event Markers

pip install --upgrade pip
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from datasets import load_dataset
dataset = load_dataset("Daniela-C/Yellow_tripdata_2025")
df = dataset["train"].to_pandas()

st.set_page_config(page_title="NYC Taxi Anomaly Detector", layout="wide")
st.title("πŸš• NYC Taxi Passenger Count - Anomaly Detection")

# Load data
@st.cache_data
def load_data():
    df = pd.read_csv('Yellow_tripdata_2025', parse_dates=['tpep_pickup_datetime'], index_col='tpep_pickup_datetime')
    return df

df = load_data()

# Define NYC events/holidays
events = {
    "New Year\'s Eve": "2015-12-31",
    "New Year\'s Day": "2016-01-01",
    "Martin Luther King Jr. Day": "2016-01-18"
}

# Sidebar controls
st.sidebar.header("Filters")
start_date = st.sidebar.date_input("Start Date", df.index.min().date())
end_date = st.sidebar.date_input("End Date", df.index.max().date())
threshold_slider = st.sidebar.slider("Anomaly Threshold (%)", 90, 99, 95)

# Filtered data
filtered_df = df.loc[str(start_date):str(end_date)]

# Apply new threshold
new_threshold = filtered_df['reconstruction_error'].quantile(threshold_slider / 100.0)
filtered_df['anomaly_custom'] = filtered_df['reconstruction_error'] > new_threshold

# Plot
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(filtered_df.index, filtered_df['passenger_count'], label='Passenger Count')
ax.scatter(filtered_df[filtered_df['anomaly_custom']].index,
           filtered_df[filtered_df['anomaly_custom']]['passenger_count'],
           color='red', label='Anomaly')

# Add event markers
for name, date_str in events.items():
    event_date = pd.to_datetime(date_str)
    if event_date in filtered_df.index:
        ax.axvline(event_date, color='orange', linestyle='--', alpha=0.7)
        ax.text(event_date, ax.get_ylim()[1]*0.9, name, rotation=90, color='orange', fontsize=8)

ax.set_title('Anomaly Detection with NYC Event Markers')
ax.legend()
ax.set_xlabel("Date")
ax.set_ylabel("Passenger Count")
plt.xticks(rotation=45)
st.pyplot(fig)

# Show data table
with st.expander("πŸ“„ View Data Table"):
    st.dataframe(filtered_df[['passenger_count', 'reconstruction_error', 'anomaly_custom']])

# Download
st.download_button("Download Anomalies CSV", data=filtered_df.to_csv().encode('utf-8'), file_name="filtered_anomalies.csv", mime="text/csv")