Spaces:

RAHULJUNEJA33
/

Financial_Report_Sentiment_Analyzer

Sleeping

App Files Files Community

RAHULJUNEJA33 committed on Mar 17

Commit

47eeab8

verified ·

1 Parent(s): 0fdee13

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -62

app.py CHANGED Viewed

@@ -3,37 +3,34 @@ import torch
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from nltk.tokenize import sent_tokenize
 from collections import defaultdict
-import fitz  # PyMuPDF
 import re
-import nltk
 import os
-# Ensure NLTK data is stored in the correct directory
 NLTK_DATA_PATH = "/root/nltk_data"
 os.makedirs(NLTK_DATA_PATH, exist_ok=True)
 nltk.data.path.append(NLTK_DATA_PATH)
-# Download required resources
 nltk.download('punkt', download_dir=NLTK_DATA_PATH)
-# Streamlit app configuration
 st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
 st.title("📊 Financial Report Sentiment Analyzer")
 st.markdown("""
 ### What is FinBERT?
-**FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.
-We analyze three key financial aspects:
 1. **Assets** – What the company owns
 2. **Liabilities** – What the company owes
 3. **Equity** – Net worth (Assets - Liabilities)
 ---
 """)
-# File uploader
 uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
-# CSS Styling for Report Preview
 st.markdown("""
 <style>
 .report-preview {
@@ -42,7 +39,7 @@ st.markdown("""
     max-height: 300px;
     overflow-y: scroll;
     background-color: #f9f9f9;
-    color: #333333 !important;
     white-space: pre-wrap;
     line-height: 1.6;
     font-family: Arial, sans-serif;
@@ -50,50 +47,49 @@ st.markdown("""
 </style>
 """, unsafe_allow_html=True)
 if uploaded_file:
-    # Extract text from the uploaded file
-    if uploaded_file.name.endswith('.pdf'):
-        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
-            report_text = "".join([page.get_text() for page in doc])
-    else:
-        report_text = uploaded_file.read().decode('utf-8')
-    # Display the uploaded report preview
     st.write("### 📄 Uploaded Report Preview:")
-    st.markdown(f'''
-    <div class="report-preview">
-        {report_text[:5000]}
-    </div>
-    ''', unsafe_allow_html=True)
-    # Load FinBERT Model (cached for performance)
-    @st.cache_resource
-    def load_model():
-        tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
-        model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
-        return tokenizer, model
-    tokenizer, model = load_model()
-    label_mapping = {0: 'Positive', 1: 'Negative', 2: 'Neutral'}
-    # Sentiment Analysis Function
     def analyze_sentiment(sentence):
         inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
         with torch.no_grad():
             outputs = model(**inputs)
         probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
         label_idx = torch.argmax(probs, dim=1).item()
-        label = label_mapping[label_idx]
-        return label, probs.tolist()[0]
-    # Extract sentences using regex and NLTK tokenizer
     def extract_sentences(text, keywords):
         sentences = sent_tokenize(text)
-        keywords_lower = [k.lower() for k in keywords]
-        pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords_lower)) + r')\b', re.IGNORECASE)
         return [s for s in sentences if pattern.search(s)]
-    # Analyze financial sentiment category-wise
     def analyze_category(text, category_name, keywords):
         sentences = extract_sentences(text, keywords)
         if not sentences:
@@ -111,34 +107,22 @@ if uploaded_file:
         total = sum(sentiment_scores.values())
         sentiment_percentages = {
-            'Positive': (sentiment_scores.get('Positive', 0) / total) * 100,
-            'Negative': (sentiment_scores.get('Negative', 0) / total) * 100,
-            'Neutral': (sentiment_scores.get('Neutral', 0) / total) * 100
         }
         return sentiment_percentages, negative_sentences
-    # Define financial categories and keywords
     categories = {
-        'Assets': [
-            'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
-            'inventory', 'receivables', 'property', 'equipment', 'investments',
-            'prepaid expenses', 'securities', 'liquid assets', 'capital assets'
-        ],
-        'Liabilities': [
-            'liability', 'liabilities', 'debt', 'accounts payable', 'accrued expenses',
-            'loans payable', 'bonds payable', 'mortgage', 'taxes payable', 'leases',
-            'borrowings', 'creditors', 'obligations', 'outstanding debt'
-        ],
-        'Equity': [
-            'equity', 'shareholders equity', 'stockholders equity', 'common stock',
-            'preferred stock', 'retained earnings', 'treasury stock', 'paid-in capital',
-            'net worth', 'owner’s equity', 'share capital', 'accumulated deficit'
-        ]
     }
     st.write("## 📝 Sentiment Analysis Results:")
-    # Perform sentiment analysis for each financial category
     for category, keywords in categories.items():
         st.write(f"### 🔍 {category}")
         result = analyze_category(report_text, category, keywords)
@@ -147,11 +131,13 @@ if uploaded_file:
         sentiment_percentages, negative_sentences = result
         cols = st.columns(3)
         cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
         cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
         cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")
         if negative_sentences:
             with st.expander("🔻 View Negative Sentences"):
                 for idx, (sentence, probs) in enumerate(negative_sentences, 1):

 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from nltk.tokenize import sent_tokenize
 from collections import defaultdict
+import fitz  # PyMuPDF for PDF reading
 import re
 import os
+import nltk
+# ✅ Fix NLTK Issue: Set Custom Download Path
+# sent_tokenize() needs the Punkt sentence-boundary data. NLTK 3.9 and later
+# look it up under the resource name 'punkt_tab', while earlier releases use
+# 'punkt', so both are fetched to keep tokenization working on either version.
 NLTK_DATA_PATH = "/root/nltk_data"
 os.makedirs(NLTK_DATA_PATH, exist_ok=True)
 nltk.data.path.append(NLTK_DATA_PATH)
 nltk.download('punkt', download_dir=NLTK_DATA_PATH)
+nltk.download('punkt_tab', download_dir=NLTK_DATA_PATH)
+# Streamlit App Configuration
 st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
 st.title("📊 Financial Report Sentiment Analyzer")
 st.markdown("""
 ### What is FinBERT?
+**FinBERT** is a language model fine-tuned for financial text analysis. It classifies sentiment as **Positive, Neutral, or Negative** for key financial aspects:
 1. **Assets** – What the company owns
 2. **Liabilities** – What the company owes
 3. **Equity** – Net worth (Assets - Liabilities)
 ---
 """)
+# File Upload
 uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
+# ✅ Custom CSS for Better Report Preview
 st.markdown("""
 <style>
 .report-preview {
     max-height: 300px;
     overflow-y: scroll;
     background-color: #f9f9f9;
+    color: #333 !important;
     white-space: pre-wrap;
     line-height: 1.6;
     font-family: Arial, sans-serif;
 </style>
 """, unsafe_allow_html=True)
+# ✅ Load FinBERT Model (Optimized with Streamlit Caching)
+@st.cache_resource
+def load_model():
+    """Load the FinBERT-tone tokenizer and sequence classifier.
+
+    Both are fetched from the "yiyanghkust/finbert-tone" checkpoint. The
+    function is wrapped in ``st.cache_resource`` so the pair is intended to be
+    built once and reused instead of being reloaded on every script rerun.
+
+    Returns:
+        tuple: ``(tokenizer, model)``.
+    """
+    tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
+    model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
+    return tokenizer, model
+tokenizer, model = load_model()
+# Class-index order published for finbert-tone: 0 = neutral, 1 = positive,
+# 2 = negative. The mapping must follow that order, otherwise every prediction
+# is reported under the wrong sentiment name.
+label_mapping = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
+# ✅ Extract Text from Uploaded File
+def extract_text(file):
+    """Return the text content of an uploaded report.
+
+    Args:
+        file: Uploaded file object exposing ``name`` and ``read()``.
+
+    Returns:
+        str: For a PDF, the text of all pages joined with newlines; for any
+        other file, its bytes decoded as UTF-8. If reading or decoding fails,
+        the error is shown with ``st.error`` and an empty string is returned.
+    """
+    try:
+        # Compare the extension case-insensitively so "REPORT.PDF" is parsed
+        # as a PDF instead of falling through to the UTF-8 text branch.
+        if file.name.lower().endswith('.pdf'):
+            with fitz.open(stream=file.read(), filetype="pdf") as doc:
+                return "\n".join(page.get_text() for page in doc)
+        return file.read().decode('utf-8')
+    except Exception as e:
+        # UI boundary: report the problem to the user rather than crashing the app.
+        st.error(f"❌ Error reading file: {e}")
+        return ""
 if uploaded_file:
+    report_text = extract_text(uploaded_file)
     st.write("### 📄 Uploaded Report Preview:")
+    # The preview is rendered with unsafe_allow_html=True, so the report text is
+    # HTML-escaped first; otherwise markup inside an uploaded file would be
+    # interpreted by the browser, and characters such as '<' or '&' in the
+    # report would not be displayed as written.
+    import html
+    st.markdown(f"<div class='report-preview'>{html.escape(report_text[:5000])}</div>", unsafe_allow_html=True)
+    # ✅ Sentiment Analysis Function
+    def analyze_sentiment(sentence):
+        """Classify a single sentence with the loaded FinBERT model.
+
+        Returns a ``(label, probabilities)`` pair: the ``label_mapping`` entry
+        for the highest-scoring class, and the softmax probabilities of the
+        sentence as a plain list.
+        """
+        # Inputs longer than 512 tokens are truncated by the tokenizer.
+        encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
+        # Inference only, so gradient tracking is switched off.
+        with torch.no_grad():
+            logits = model(**encoded).logits
+        probabilities = torch.nn.functional.softmax(logits, dim=-1)
+        best_index = torch.argmax(probabilities, dim=1).item()
+        return label_mapping[best_index], probabilities.tolist()[0]
+    # ✅ Extract Sentences Matching Financial Keywords
+    def extract_sentences(text, keywords):
+        """Return the sentences of ``text`` that mention any of ``keywords``.
+
+        ``text`` is split with ``sent_tokenize``; a sentence is kept when one of
+        the keywords occurs in it as a whole word or phrase, ignoring case.
+        An empty ``keywords`` collection yields an empty list.
+        """
+        # With no alternatives the pattern would degenerate to r'\b()\b', which
+        # matches at any word boundary and would select practically every sentence.
+        if not keywords:
+            return []
+        pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, keywords)) + r')\b', re.IGNORECASE)
+        return [s for s in sent_tokenize(text) if pattern.search(s)]
+    # ✅ Analyze Sentiment for a Specific Financial Category
     def analyze_category(text, category_name, keywords):
         sentences = extract_sentences(text, keywords)
         if not sentences:
         total = sum(sentiment_scores.values())
         sentiment_percentages = {
+            'Positive': (sentiment_scores.get('Positive', 0) / total) * 100 if total else 0,
+            'Negative': (sentiment_scores.get('Negative', 0) / total) * 100 if total else 0,
+            'Neutral': (sentiment_scores.get('Neutral', 0) / total) * 100 if total else 0
         }
         return sentiment_percentages, negative_sentences
+    # ✅ Financial Categories & Keywords
+    # Keywords are matched as whole words/phrases (the search pattern is anchored
+    # with \b on both sides), so a singular such as 'asset' does not match
+    # "assets". Plural forms therefore have to be listed explicitly; without
+    # 'assets' and 'liabilities' a sentence like "Total assets increased" is skipped.
     categories = {
+        'Assets': ['asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents', 'inventory', 'receivables', 'property', 'investments'],
+        'Liabilities': ['liability', 'liabilities', 'debt', 'accounts payable', 'loans payable', 'taxes payable', 'borrowings', 'creditors', 'obligations'],
+        'Equity': ['equity', 'shareholders equity', 'stockholders equity', 'common stock', 'retained earnings', 'net worth', 'share capital']
     }
+    # ✅ Sentiment Analysis Results
     st.write("## 📝 Sentiment Analysis Results:")
     for category, keywords in categories.items():
         st.write(f"### 🔍 {category}")
         result = analyze_category(report_text, category, keywords)
         sentiment_percentages, negative_sentences = result
+        # Display Sentiment Metrics
         cols = st.columns(3)
         cols[0].metric(label="✅ Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
         cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
         cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")
+        # Show Negative Sentences (if any)
         if negative_sentences:
             with st.expander("🔻 View Negative Sentences"):
                 for idx, (sentence, probs) in enumerate(negative_sentences, 1):