RAHULJUNEJA33 committed on
Commit
47eeab8
·
verified ·
1 Parent(s): 0fdee13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -62
app.py CHANGED
@@ -3,37 +3,34 @@ import torch
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  from nltk.tokenize import sent_tokenize
5
  from collections import defaultdict
6
- import fitz # PyMuPDF
7
  import re
8
- import nltk
9
  import os
 
10
 
11
- # Ensure NLTK data is stored in the correct directory
12
  NLTK_DATA_PATH = "/root/nltk_data"
13
  os.makedirs(NLTK_DATA_PATH, exist_ok=True)
14
  nltk.data.path.append(NLTK_DATA_PATH)
15
-
16
- # Download required resources
17
  nltk.download('punkt', download_dir=NLTK_DATA_PATH)
18
 
19
- # Streamlit app configuration
20
  st.set_page_config(page_title="πŸ“Š Financial Report Sentiment Analyzer", layout="wide")
21
  st.title("πŸ“Š Financial Report Sentiment Analyzer")
22
 
23
  st.markdown("""
24
  ### What is FinBERT?
25
- **FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.
26
- We analyze three key financial aspects:
27
  1. **Assets** – What the company owns
28
  2. **Liabilities** – What the company owes
29
  3. **Equity** – Net worth (Assets - Liabilities)
30
  ---
31
  """)
32
 
33
- # File uploader
34
  uploaded_file = st.file_uploader("πŸ“‚ Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
35
 
36
- # CSS Styling for Report Preview
37
  st.markdown("""
38
  <style>
39
  .report-preview {
@@ -42,7 +39,7 @@ st.markdown("""
42
  max-height: 300px;
43
  overflow-y: scroll;
44
  background-color: #f9f9f9;
45
- color: #333333 !important;
46
  white-space: pre-wrap;
47
  line-height: 1.6;
48
  font-family: Arial, sans-serif;
@@ -50,50 +47,49 @@ st.markdown("""
50
  </style>
51
  """, unsafe_allow_html=True)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  if uploaded_file:
54
- # Extract text from the uploaded file
55
- if uploaded_file.name.endswith('.pdf'):
56
- with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
57
- report_text = "".join([page.get_text() for page in doc])
58
- else:
59
- report_text = uploaded_file.read().decode('utf-8')
60
-
61
- # Display the uploaded report preview
62
  st.write("### πŸ“„ Uploaded Report Preview:")
63
- st.markdown(f'''
64
- <div class="report-preview">
65
- {report_text[:5000]}
66
- </div>
67
- ''', unsafe_allow_html=True)
68
-
69
- # Load FinBERT Model (cached for performance)
70
- @st.cache_resource
71
- def load_model():
72
- tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
73
- model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
74
- return tokenizer, model
75
-
76
- tokenizer, model = load_model()
77
- label_mapping = {0: 'Positive', 1: 'Negative', 2: 'Neutral'}
78
-
79
- # Sentiment Analysis Function
80
  def analyze_sentiment(sentence):
81
  inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
82
  with torch.no_grad():
83
  outputs = model(**inputs)
84
  probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
85
  label_idx = torch.argmax(probs, dim=1).item()
86
- label = label_mapping[label_idx]
87
- return label, probs.tolist()[0]
88
 
89
- # Extract sentences using regex and NLTK tokenizer
90
  def extract_sentences(text, keywords):
91
  sentences = sent_tokenize(text)
92
- keywords_lower = [k.lower() for k in keywords]
93
- pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords_lower)) + r')\b', re.IGNORECASE)
94
  return [s for s in sentences if pattern.search(s)]
95
 
96
- # Analyze financial sentiment category-wise
97
  def analyze_category(text, category_name, keywords):
98
  sentences = extract_sentences(text, keywords)
99
  if not sentences:
@@ -111,34 +107,22 @@ if uploaded_file:
111
 
112
  total = sum(sentiment_scores.values())
113
  sentiment_percentages = {
114
- 'Positive': (sentiment_scores.get('Positive', 0) / total) * 100,
115
- 'Negative': (sentiment_scores.get('Negative', 0) / total) * 100,
116
- 'Neutral': (sentiment_scores.get('Neutral', 0) / total) * 100
117
  }
118
  return sentiment_percentages, negative_sentences
119
 
120
- # Define financial categories and keywords
121
  categories = {
122
- 'Assets': [
123
- 'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
124
- 'inventory', 'receivables', 'property', 'equipment', 'investments',
125
- 'prepaid expenses', 'securities', 'liquid assets', 'capital assets'
126
- ],
127
- 'Liabilities': [
128
- 'liability', 'liabilities', 'debt', 'accounts payable', 'accrued expenses',
129
- 'loans payable', 'bonds payable', 'mortgage', 'taxes payable', 'leases',
130
- 'borrowings', 'creditors', 'obligations', 'outstanding debt'
131
- ],
132
- 'Equity': [
133
- 'equity', 'shareholders equity', 'stockholders equity', 'common stock',
134
- 'preferred stock', 'retained earnings', 'treasury stock', 'paid-in capital',
135
- 'net worth', 'owner’s equity', 'share capital', 'accumulated deficit'
136
- ]
137
  }
138
 
 
139
  st.write("## πŸ“ Sentiment Analysis Results:")
140
 
141
- # Perform sentiment analysis for each financial category
142
  for category, keywords in categories.items():
143
  st.write(f"### πŸ” {category}")
144
  result = analyze_category(report_text, category, keywords)
@@ -147,11 +131,13 @@ if uploaded_file:
147
 
148
  sentiment_percentages, negative_sentences = result
149
 
 
150
  cols = st.columns(3)
151
  cols[0].metric(label="βœ… Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
152
  cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
153
  cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")
154
 
 
155
  if negative_sentences:
156
  with st.expander("πŸ”» View Negative Sentences"):
157
  for idx, (sentence, probs) in enumerate(negative_sentences, 1):
 
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  from nltk.tokenize import sent_tokenize
5
  from collections import defaultdict
6
+ import fitz # PyMuPDF for PDF reading
7
  import re
 
8
  import os
9
+ import nltk
10
 
11
# Keep the NLTK tokenizer data in an explicit directory and make sure NLTK
# searches it.
NLTK_DATA_PATH = "/root/nltk_data"
os.makedirs(NLTK_DATA_PATH, exist_ok=True)
nltk.data.path.append(NLTK_DATA_PATH)
# sent_tokenize loads 'punkt' on older NLTK releases and 'punkt_tab' on recent
# ones; fetch both so sentence splitting works whichever version is installed.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, download_dir=NLTK_DATA_PATH)
16
 
17
+ # Streamlit App Configuration
18
  st.set_page_config(page_title="πŸ“Š Financial Report Sentiment Analyzer", layout="wide")
19
  st.title("πŸ“Š Financial Report Sentiment Analyzer")
20
 
21
  st.markdown("""
22
  ### What is FinBERT?
23
+ **FinBERT** is a language model fine-tuned for financial text analysis. It classifies sentiment as **Positive, Neutral, or Negative** for key financial aspects:
 
24
  1. **Assets** – What the company owns
25
  2. **Liabilities** – What the company owes
26
  3. **Equity** – Net worth (Assets - Liabilities)
27
  ---
28
  """)
29
 
30
+ # File Upload
31
  uploaded_file = st.file_uploader("πŸ“‚ Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
32
 
33
+ # βœ… Custom CSS for Better Report Preview
34
  st.markdown("""
35
  <style>
36
  .report-preview {
 
39
  max-height: 300px;
40
  overflow-y: scroll;
41
  background-color: #f9f9f9;
42
+ color: #333 !important;
43
  white-space: pre-wrap;
44
  line-height: 1.6;
45
  font-family: Arial, sans-serif;
 
47
  </style>
48
  """, unsafe_allow_html=True)
49
 
50
# FinBERT loader; st.cache_resource keeps the loaded objects across reruns.
@st.cache_resource
def load_model():
    """Load the ``yiyanghkust/finbert-tone`` tokenizer and classifier.

    Returns:
        tuple: ``(tokenizer, model)`` as produced by the two
        ``from_pretrained`` calls.
    """
    checkpoint = "yiyanghkust/finbert-tone"
    finbert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    finbert_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    return finbert_tokenizer, finbert_model
56
+
57
tokenizer, model = load_model()
# Class indices as documented on the yiyanghkust/finbert-tone model card:
# LABEL_0 = neutral, LABEL_1 = positive, LABEL_2 = negative. Any other order
# silently mislabels every sentence, so keep this in sync with the checkpoint.
label_mapping = {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
59
+
60
# Extract the text of an uploaded report
def extract_text(file):
    """Return the text content of an uploaded PDF or plain-text report.

    Args:
        file: Binary file-like object exposing ``name`` and ``read()``
            (for example the object returned by ``st.file_uploader``).

    Returns:
        str: For names ending in ``.pdf`` (any letter case), the text of all
        pages joined with newlines; otherwise the content decoded as UTF-8.
        Returns ``""`` when the file cannot be read; the error is reported
        through ``st.error``.
    """
    try:
        # Rewind so a stream that has already been consumed is read from the
        # start again instead of yielding no data.
        if hasattr(file, "seek"):
            file.seek(0)
        raw = file.read()
        # Compare the suffix case-insensitively so "REPORT.PDF" is still
        # handled as a PDF rather than decoded as text.
        if file.name.lower().endswith('.pdf'):
            with fitz.open(stream=raw, filetype="pdf") as doc:
                return "\n".join(page.get_text() for page in doc)
        return raw.decode('utf-8')
    except Exception as e:  # UI boundary: surface the failure, keep the app alive
        st.error(f"❌ Error reading file: {e}")
        return ""
71
+
72
  if uploaded_file:
73
from html import escape

report_text = extract_text(uploaded_file)
st.write("### πŸ“„ Uploaded Report Preview:")
# The preview is rendered with unsafe_allow_html=True, so the report text is
# HTML-escaped first; otherwise '<', '>' or '&' in the document would be
# interpreted as markup (broken layout or injected HTML).
preview_html = escape(report_text[:5000])
st.markdown(f"<div class='report-preview'>{preview_html}</div>", unsafe_allow_html=True)
76
+
77
# Sentiment analysis for a single sentence
def analyze_sentiment(sentence):
    """Classify one sentence with the loaded FinBERT model.

    Args:
        sentence: Text handed to the tokenizer (truncated to 512 tokens).

    Returns:
        tuple: ``(label, probabilities)`` where ``label`` is the
        ``label_mapping`` entry of the highest-probability class and
        ``probabilities`` is the first row of the softmax output as a list.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        model_output = model(**encoded)
    probs = torch.nn.functional.softmax(model_output.logits, dim=-1)
    best_class = torch.argmax(probs, dim=1).item()
    label = label_mapping[best_class]
    return label, probs.tolist()[0]
 
85
 
86
# Keep only the sentences that mention one of the given keywords
def extract_sentences(text, keywords):
    """Return the sentences of ``text`` that contain any of ``keywords``.

    Args:
        text: Document text; split into sentences with ``sent_tokenize``.
        keywords: Iterable of keyword strings. Each is regex-escaped and
            matched case-insensitively between word boundaries.

    Returns:
        list: Matching sentences in their original order.
    """
    candidates = sent_tokenize(text)
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    keyword_re = re.compile(r'\b(' + alternatives + r')\b', re.IGNORECASE)
    matching = []
    for sentence in candidates:
        if keyword_re.search(sentence):
            matching.append(sentence)
    return matching
91
 
92
+ # βœ… Analyze Sentiment for a Specific Financial Category
93
  def analyze_category(text, category_name, keywords):
94
  sentences = extract_sentences(text, keywords)
95
  if not sentences:
 
107
 
108
  total = sum(sentiment_scores.values())
109
  sentiment_percentages = {
110
+ 'Positive': (sentiment_scores.get('Positive', 0) / total) * 100 if total else 0,
111
+ 'Negative': (sentiment_scores.get('Negative', 0) / total) * 100 if total else 0,
112
+ 'Neutral': (sentiment_scores.get('Neutral', 0) / total) * 100 if total else 0
113
  }
114
  return sentiment_percentages, negative_sentences
115
 
116
# Financial categories and the keywords used to select sentences for each.
# extract_sentences matches keywords between word boundaries, so a singular
# keyword does not match its plural: 'asset' alone misses "total assets".
# The plural forms are therefore listed explicitly.
categories = {
    'Assets': [
        'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
        'inventory', 'receivables', 'property', 'investments',
    ],
    'Liabilities': [
        'liability', 'liabilities', 'debt', 'accounts payable', 'loans payable',
        'taxes payable', 'borrowings', 'creditors', 'obligations',
    ],
    'Equity': [
        'equity', 'shareholders equity', 'stockholders equity', 'common stock',
        'retained earnings', 'net worth', 'share capital',
    ],
}
122
 
123
+ # βœ… Sentiment Analysis Results
124
  st.write("## πŸ“ Sentiment Analysis Results:")
125
 
 
126
  for category, keywords in categories.items():
127
  st.write(f"### πŸ” {category}")
128
  result = analyze_category(report_text, category, keywords)
 
131
 
132
  sentiment_percentages, negative_sentences = result
133
 
134
+ # Display Sentiment Metrics
135
  cols = st.columns(3)
136
  cols[0].metric(label="βœ… Positive", value=f"{sentiment_percentages['Positive']:.1f}%")
137
  cols[1].metric(label="⚠️ Negative", value=f"{sentiment_percentages['Negative']:.1f}%")
138
  cols[2].metric(label="ℹ️ Neutral", value=f"{sentiment_percentages['Neutral']:.1f}%")
139
 
140
+ # Show Negative Sentences (if any)
141
  if negative_sentences:
142
  with st.expander("πŸ”» View Negative Sentences"):
143
  for idx, (sentence, probs) in enumerate(negative_sentences, 1):