"""Streamlit dashboard for analysing an uploaded WhatsApp chat export.

The script reads a ``.txt`` chat file from the sidebar, parses it with
``preprocessor.preprocess`` and, once the user asks for it, renders message
statistics, timelines, activity maps, word/emoji usage, sentiment charts,
topic views and conversation clusters using the ``helper`` module.
"""

import streamlit as st

# set_page_config has to be the first Streamlit command of the script, which
# is why it runs before the remaining imports.
st.set_page_config(page_title="WhatsApp Chat Analyzer", layout="wide")

import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import preprocessor
import helper
from sentiment import predict_sentiment_batch

os.environ["STREAMLIT_SERVER_RUN_ON_SAVE"] = "false"

# session_state key remembering that the user asked for the analysis.
SHOW_ANALYSIS_KEY = "show_analysis"

# Sentiment label -> bar colour, in the order the monthly charts are drawn.
SENTIMENT_COLORS = {"positive": "green", "neutral": "blue", "negative": "red"}

DAY_ORDER = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Full month name -> abbreviation, listed in calendar order.
MONTH_ABBREVIATIONS = {
    'January': 'Jan', 'February': 'Feb', 'March': 'Mar', 'April': 'Apr',
    'May': 'May', 'June': 'Jun', 'July': 'Jul', 'August': 'Aug',
    'September': 'Sep', 'October': 'Oct', 'November': 'Nov', 'December': 'Dec'
}


@st.cache_data
def load_and_preprocess(file_content):
    """Parse the decoded chat text with ``preprocessor.preprocess`` (cached)."""
    return preprocessor.preprocess(file_content)


@st.cache_data
def _predict_sentiments(message_list):
    """Return the sentiment labels for ``message_list`` as a list (cached).

    Caching matters because the analysis is re-rendered on every widget
    interaction; without it the model would run again on each rerun.
    """
    print("Starting sentiment analysis...")
    print(f"Processing {len(message_list)} messages")
    print(f"Sample messages: {message_list[:5]}")
    labels = list(predict_sentiment_batch(message_list))
    print("Sentiment analysis completed successfully")
    return labels


def _add_sentiment_column(df_filtered):
    """Ensure ``df_filtered`` has a 'sentiment' column; return True on success.

    Only messages containing non-whitespace text are sent to the model. Their
    labels are written back to the matching rows and blank rows get ``None``,
    so the column always has the same length as the frame.
    NOTE(review): assumes predict_sentiment_batch returns one label per input
    message, in input order -- confirm against the sentiment module.
    """
    if 'sentiment' in df_filtered.columns:
        return True
    try:
        messages = df_filtered["message"].astype(str).tolist()
        has_text = [bool(msg.strip()) for msg in messages]
        labels = _predict_sentiments(
            [msg for msg, keep in zip(messages, has_text) if keep]
        )
        expected = sum(has_text)
        if len(labels) != expected:
            raise ValueError(
                f"expected {expected} sentiment labels, got {len(labels)}"
            )
        remaining = iter(labels)
        df_filtered['sentiment'] = [
            next(remaining) if keep else None for keep in has_text
        ]
    except Exception as e:
        # UI boundary: report the failure and let the non-sentiment sections
        # render instead of aborting the whole page.
        st.error(f"Sentiment analysis failed: {str(e)}")
        print(f"Full error: {str(e)}")
        return False
    return True


def _show_figure(fig):
    """Render ``fig`` in Streamlit and release it so figures do not pile up."""
    st.pyplot(fig)
    if isinstance(fig, plt.Figure):
        plt.close(fig)
    else:
        # Not a Figure object: fall back to clearing the current figure.
        plt.clf()


def _line_chart(data, x, title, color):
    """Draw a message-count line chart of ``data`` over column ``x``."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=data, x=x, y='message', color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Date")
    ax.set_ylabel("Messages")
    _show_figure(fig)


def _bar_chart(series, palette, title, xlabel, rotation=None):
    """Draw a bar chart of ``series`` (index on x, values on y)."""
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=series.index, y=series.values, palette=palette, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Message Count")
    if rotation is not None:
        plt.setp(ax.get_xticklabels(), rotation=rotation)
    _show_figure(fig)


def _render_top_statistics(selected_user, df_filtered):
    """Show the four headline counters returned by ``helper.fetch_stats``."""
    num_messages, words, num_media, num_links = helper.fetch_stats(selected_user, df_filtered)
    st.title("Top Statistics")
    headline = [
        ("Total Messages", num_messages),
        ("Total Words", words),
        ("Media Shared", num_media),
        ("Links Shared", num_links),
    ]
    for column, (label, value) in zip(st.columns(4), headline):
        with column:
            st.header(label)
            st.title(value)


def _render_timelines(selected_user, df_filtered):
    """Show the monthly and daily message timelines."""
    # NOTE(review): both timelines are built from a random sample of at most
    # 5000 rows, so counts for larger chats are under-reported and change
    # between reruns. Kept as in the original.
    st.title("Monthly Timeline")
    timeline = helper.monthly_timeline(
        selected_user, df_filtered.sample(min(5000, len(df_filtered)))
    )
    if not timeline.empty:
        _line_chart(timeline, 'time', "Monthly Timeline", 'green')

    st.title("Daily Timeline")
    daily_timeline = helper.daily_timeline(
        selected_user, df_filtered.sample(min(5000, len(df_filtered)))
    )
    if not daily_timeline.empty:
        _line_chart(daily_timeline, 'date', "Daily Timeline", 'black')


def _render_activity(selected_user, df_filtered):
    """Show busiest day/month charts and, for 'Overall', the busiest users."""
    st.title("Activity Map")
    col1, col2 = st.columns(2)
    with col1:
        st.header("Most Busy Day")
        busy_day = helper.week_activity_map(selected_user, df_filtered)
        if not busy_day.empty:
            _bar_chart(busy_day, "Purples_r", "Most Busy Day", "Day of Week")
    with col2:
        st.header("Most Busy Month")
        busy_month = helper.month_activity_map(selected_user, df_filtered)
        if not busy_month.empty:
            _bar_chart(busy_month, "Oranges_r", "Most Busy Month", "Month")

    if selected_user == 'Overall':
        st.title("Most Busy Users")
        x, new_df = helper.most_busy_users(df_filtered)
        if not x.empty:
            _bar_chart(x, "Reds_r", "Most Busy Users", "User", rotation=45)
            st.title("Word Count by User")
            st.dataframe(new_df)


def _render_common_words(selected_user, df_filtered):
    """Show the most common words as a horizontal bar chart."""
    st.title("Most Common Words")
    most_common_df = helper.most_common_words(selected_user, df_filtered)
    if most_common_df.empty:
        st.warning("No data available for most common words.")
        return
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(y=most_common_df[0], x=most_common_df[1], ax=ax, palette="Blues_r")
    ax.set_title("Top 20 Most Common Words")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Words")
    plt.setp(ax.get_xticklabels(), rotation='vertical')
    _show_figure(fig)


def _render_emojis(selected_user, df_filtered):
    """Show the emoji table next to a pie chart of the first five rows."""
    st.title("Emoji Analysis")
    emoji_df = helper.emoji_helper(selected_user, df_filtered)
    if emoji_df.empty:
        st.warning("No data available for emoji analysis.")
        return
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Top Emojis Used")
        st.dataframe(emoji_df)
    with col2:
        fig, ax = plt.subplots(figsize=(8, 8))
        ax.pie(emoji_df[1].head(), labels=emoji_df[0].head(), autopct="%0.2f%%",
               startangle=90, colors=sns.color_palette("pastel"))
        ax.set_title("Top Emoji Distribution")
        _show_figure(fig)


def _render_sentiment(selected_user, df_filtered):
    """Show sentiment counts per month, per weekday and (Overall) per user."""
    st.title("📈 Sentiment Analysis")

    # Abbreviate month names in a local series: overwriting the 'month'
    # column would change the frame later handed to the topic/cluster helpers.
    month_abbr = df_filtered['month'].map(MONTH_ABBREVIATIONS)
    monthly_sentiment = (
        df_filtered.groupby([month_abbr, 'sentiment']).size().unstack(fill_value=0)
    )
    # groupby sorts months alphabetically; put them back in calendar order.
    monthly_sentiment = monthly_sentiment.reindex(
        [m for m in MONTH_ABBREVIATIONS.values() if m in monthly_sentiment.index]
    )

    st.write("### Sentiment Count by Month (Histogram)")
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (label, color) in zip(axes, SENTIMENT_COLORS.items()):
        if label in monthly_sentiment:
            ax.bar(monthly_sentiment.index, monthly_sentiment[label], color=color)
            ax.set_title(f"{label.capitalize()} Sentiment")
            ax.set_xlabel('Month')
            ax.set_ylabel('Count')
    _show_figure(fig)

    # Sentiment counts per weekday, Monday first; absent days count as zero.
    sentiment_counts = (
        df_filtered.groupby(['day_of_week', 'sentiment']).size()
        .unstack(fill_value=0)
        .reindex(DAY_ORDER, fill_value=0)
    )
    st.write("### Daily Sentiment Analysis")
    if sentiment_counts.empty:
        st.warning("No sentiment data available")
    else:
        fig, ax = plt.subplots(figsize=(10, 5))
        # Pick colours by column name so a missing class cannot shift them.
        bar_colors = [SENTIMENT_COLORS.get(label, 'gray') for label in sentiment_counts.columns]
        sentiment_counts.plot(kind='bar', stacked=False, ax=ax, color=bar_colors)
        ax.set_xlabel("Day of the Week")
        ax.set_ylabel("Count")
        ax.set_title("Sentiment Distribution per Day of the Week")
        ax.legend(title="Sentiment")
        _show_figure(fig)

    if selected_user != 'Overall':
        return

    # Messages per user and sentiment, plus each user's share of that
    # sentiment's total message count.
    per_user = df_filtered.groupby(['user', 'sentiment']).size().reset_index(name='Count')
    total_per_sentiment = df_filtered['sentiment'].value_counts()
    per_user['Percentage'] = (
        per_user['Count'] / per_user['sentiment'].map(total_per_sentiment)
    ) * 100

    st.write("### Sentiment Contribution by User")
    for column, label in zip(st.columns(3), SENTIMENT_COLORS):
        top_users = (
            per_user[per_user['sentiment'] == label]
            .sort_values(by='Count', ascending=False)
            .head(10)
        )
        with column:
            st.subheader(f"Top {label.capitalize()} Contributors")
            if not top_users.empty:
                st.dataframe(top_users[['user', 'Count', 'Percentage']])
            else:
                st.warning(f"No {label} sentiment data")


def _render_topics(df_filtered):
    """Show topic distribution, sample messages per topic and topic trends."""
    st.title("🔍 Area of Focus: Topic Analysis")

    st.header("Topic Distribution")
    try:
        fig = helper.plot_topic_distribution(df_filtered)
        _show_figure(fig)
    except Exception as e:
        st.warning(f"Could not display topic distribution: {str(e)}")

    st.header("Sample Messages for Each Topic")
    if 'topic' in df_filtered.columns:
        for topic_id in sorted(df_filtered['topic'].unique()):
            st.subheader(f"Topic {topic_id}")
            filtered_messages = df_filtered[df_filtered['topic'] == topic_id]['message']
            sample_size = min(5, len(filtered_messages))
            if sample_size > 0:
                for msg in filtered_messages.sample(sample_size, replace=False).tolist():
                    st.write(f"- {msg}")
            else:
                st.write("No messages available for this topic.")
    else:
        st.warning("Topic information not available")

    st.header("📅 Topic Trends Over Time")
    time_freq = st.selectbox("Select Time Frequency", ["Daily", "Weekly", "Monthly"], key='time_freq')
    try:
        freq_map = {"Daily": "D", "Weekly": "W", "Monthly": "M"}
        topic_distribution = helper.topic_distribution_over_time(
            df_filtered, time_freq=freq_map[time_freq]
        )
        use_plotly = st.checkbox("Use interactive visualization", value=True, key='use_plotly')
        if use_plotly:
            fig = helper.plot_topic_distribution_over_time_plotly(topic_distribution)
            st.plotly_chart(fig, use_container_width=True)
        else:
            fig = helper.plot_topic_distribution_over_time(topic_distribution)
            _show_figure(fig)
    except Exception as e:
        st.warning(f"Could not display topic trends: {str(e)}")


def _render_clusters(selected_user, df_filtered):
    """Cluster the messages and show the plot, insights and sample messages."""
    st.title("🧩 Conversation Clusters")
    n_clusters = st.slider("Select number of clusters", min_value=2, max_value=10, value=5, key='n_clusters')

    with st.spinner("Analyzing conversation clusters..."):
        try:
            df_clustered, reduced_features, _ = preprocessor.preprocess_for_clustering(
                df_filtered, n_clusters=n_clusters
            )

            st.header("Cluster Visualization")
            fig = helper.plot_clusters(reduced_features, df_clustered['cluster'])
            _show_figure(fig)

            st.header("📌 Cluster Insights")

            st.subheader("1. Dominant Themes")
            cluster_labels = helper.get_cluster_labels(df_clustered, n_clusters)
            for cluster_id, label in cluster_labels.items():
                st.write(f"**Cluster {cluster_id}**: {label}")

            st.subheader("2. Temporal Patterns")
            temporal_trends = helper.get_temporal_trends(df_clustered)
            for cluster_id, trend in temporal_trends.items():
                st.write(f"**Cluster {cluster_id}**: Peaks on {trend['peak_day']} around {trend['peak_time']}")

            if selected_user == 'Overall':
                st.subheader("3. Top Contributors")
                user_contributions = helper.get_user_contributions(df_clustered)
                for cluster_id, users in user_contributions.items():
                    st.write(f"**Cluster {cluster_id}**: {', '.join(users[:3])}...")

            st.subheader("4. Sentiment Analysis")
            sentiment_by_cluster = helper.get_sentiment_by_cluster(df_clustered)
            for cluster_id, sentiment in sentiment_by_cluster.items():
                st.write(f"**Cluster {cluster_id}**: {sentiment['positive']}% positive, {sentiment['neutral']}% neutral, {sentiment['negative']}% negative")

            st.subheader("Sample Messages")
            for cluster_id in sorted(df_clustered['cluster'].unique()):
                with st.expander(f"Cluster {cluster_id} Messages"):
                    cluster_msgs = df_clustered[df_clustered['cluster'] == cluster_id]['message']
                    sample_size = min(3, len(cluster_msgs))
                    if sample_size > 0:
                        for msg in cluster_msgs.sample(sample_size, replace=False):
                            st.write(f"- {msg}")
                    else:
                        st.write("No messages available")
        except Exception as e:
            st.error(f"Clustering failed: {str(e)}")


def _render_analysis(selected_user, df_filtered):
    """Run sentiment labelling and render every analysis section in order."""
    with st.spinner("Analyzing..."):
        has_sentiment = _add_sentiment_column(df_filtered)
        st.session_state.df_filtered = df_filtered

    _render_top_statistics(selected_user, df_filtered)
    _render_timelines(selected_user, df_filtered)
    _render_activity(selected_user, df_filtered)
    _render_common_words(selected_user, df_filtered)
    _render_emojis(selected_user, df_filtered)
    if has_sentiment:
        _render_sentiment(selected_user, df_filtered)
    else:
        # Without the column the sentiment charts would raise KeyError.
        st.title("📈 Sentiment Analysis")
        st.warning("Sentiment data is not available, so the sentiment charts are skipped.")
    _render_topics(df_filtered)
    _render_clusters(selected_user, df_filtered)


# Theme customization
st.markdown(
    """ """,
    unsafe_allow_html=True
)

# Set seaborn style
sns.set_theme(style="whitegrid")

st.title("📊 WhatsApp Chat Sentiment Analysis Dashboard")
st.subheader('Instructions')
st.markdown("1. Open the sidebar and upload your WhatsApp chat file in .txt format.")
st.markdown("2. Wait for the initial processing (minimal delay).")
st.markdown("3. Customize the analysis by selecting users or filters.")
st.markdown("4. Click 'Show Analysis' for detailed results.")

st.sidebar.title("Whatsapp Chat Analyzer")
uploaded_file = st.sidebar.file_uploader("Upload your chat file (.txt)", type="txt")

if uploaded_file is None:
    # No chat loaded: forget an earlier request so the next upload starts
    # from a clean page.
    if SHOW_ANALYSIS_KEY in st.session_state:
        del st.session_state[SHOW_ANALYSIS_KEY]
else:
    try:
        # getvalue() returns the whole buffer regardless of the current read
        # position, so it is safe across script reruns.
        raw_data = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        st.error("Could not read the uploaded file: it is not valid UTF-8 text.")
        st.stop()

    with st.spinner("Loading chat data..."):
        df, _ = load_and_preprocess(raw_data)
        st.session_state.df = df

    st.sidebar.header("🔍 Filters")
    user_list = ["Overall"] + sorted(df["user"].unique().tolist())
    selected_user = st.sidebar.selectbox("Select User", user_list)
    # Copy the per-user slice so columns can be added without writing into a
    # view of the full frame.
    df_filtered = df if selected_user == "Overall" else df[df["user"] == selected_user].copy()

    # A button is True only during the rerun triggered by its click. Remember
    # the request so the selectbox/checkbox/slider inside the analysis can be
    # used without the whole analysis disappearing on the next rerun.
    if st.sidebar.button("Show Analysis"):
        st.session_state[SHOW_ANALYSIS_KEY] = True

    if st.session_state.get(SHOW_ANALYSIS_KEY, False):
        if df_filtered.empty:
            st.warning(f"No data found for user: {selected_user}")
        else:
            _render_analysis(selected_user, df_filtered)