RAHULJUNEJA33 committed on
Commit
0fdee13
·
verified ·
1 Parent(s): b880bb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -13
app.py CHANGED
@@ -3,21 +3,26 @@ import torch
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  from nltk.tokenize import sent_tokenize
5
  from collections import defaultdict
6
- import nltk
7
  import fitz # PyMuPDF
8
  import re
 
 
9
 
10
- # Download NLTK tokenizer
11
- nltk.download('punkt')
 
 
12
 
13
- st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
 
14
 
 
 
15
  st.title("📊 Financial Report Sentiment Analyzer")
16
 
17
  st.markdown("""
18
  ### What is FinBERT?
19
  **FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.
20
-
21
  We analyze three key financial aspects:
22
  1. **Assets** – What the company owns
23
  2. **Liabilities** – What the company owes
@@ -25,8 +30,10 @@ We analyze three key financial aspects:
25
  ---
26
  """)
27
 
 
28
  uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
29
 
 
30
  st.markdown("""
31
  <style>
32
  .report-preview {
@@ -44,13 +51,14 @@ st.markdown("""
44
  """, unsafe_allow_html=True)
45
 
46
  if uploaded_file:
47
- # Extract text from uploaded file
48
  if uploaded_file.name.endswith('.pdf'):
49
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
50
- report_text = "\n".join([page.get_text() for page in doc])
51
  else:
52
  report_text = uploaded_file.read().decode('utf-8')
53
 
 
54
  st.write("### 📄 Uploaded Report Preview:")
55
  st.markdown(f'''
56
  <div class="report-preview">
@@ -58,7 +66,7 @@ if uploaded_file:
58
  </div>
59
  ''', unsafe_allow_html=True)
60
 
61
- # Load FinBERT Model
62
  @st.cache_resource
63
  def load_model():
64
  tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
@@ -78,19 +86,20 @@ if uploaded_file:
78
  label = label_mapping[label_idx]
79
  return label, probs.tolist()[0]
80
 
81
- # Extract sentences based on financial keywords
82
  def extract_sentences(text, keywords):
83
  sentences = sent_tokenize(text)
84
  keywords_lower = [k.lower() for k in keywords]
85
  pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords_lower)) + r')\b', re.IGNORECASE)
86
  return [s for s in sentences if pattern.search(s)]
87
 
 
88
  def analyze_category(text, category_name, keywords):
89
  sentences = extract_sentences(text, keywords)
90
  if not sentences:
91
  st.warning(f"⚠️ No relevant sentences found for {category_name}")
92
  return None, []
93
-
94
  sentiment_scores = defaultdict(int)
95
  negative_sentences = []
96
 
@@ -100,7 +109,7 @@ if uploaded_file:
100
  if label == 'Negative':
101
  negative_sentences.append((sentence, probs))
102
 
103
- total = sum(sentiment_scores.values()) or 1 # Avoid division by zero
104
  sentiment_percentages = {
105
  'Positive': (sentiment_scores.get('Positive', 0) / total) * 100,
106
  'Negative': (sentiment_scores.get('Negative', 0) / total) * 100,
@@ -108,7 +117,7 @@ if uploaded_file:
108
  }
109
  return sentiment_percentages, negative_sentences
110
 
111
- # Expanded financial categories
112
  categories = {
113
  'Assets': [
114
  'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
@@ -129,12 +138,13 @@ if uploaded_file:
129
 
130
  st.write("## 📝 Sentiment Analysis Results:")
131
 
 
132
  for category, keywords in categories.items():
133
  st.write(f"### 🔍 {category}")
134
  result = analyze_category(report_text, category, keywords)
135
  if result[0] is None:
136
  continue
137
-
138
  sentiment_percentages, negative_sentences = result
139
 
140
  cols = st.columns(3)
 
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  from nltk.tokenize import sent_tokenize
5
  from collections import defaultdict
 
6
  import fitz # PyMuPDF
7
  import re
8
+ import nltk
9
+ import os
10
 
11
+ # Ensure NLTK data is stored in the correct directory
12
+ NLTK_DATA_PATH = "/root/nltk_data"
13
+ os.makedirs(NLTK_DATA_PATH, exist_ok=True)
14
+ nltk.data.path.append(NLTK_DATA_PATH)
15
 
16
+ # Download required resources
17
+ nltk.download('punkt', download_dir=NLTK_DATA_PATH)
18
 
19
+ # Streamlit app configuration
20
+ st.set_page_config(page_title="📊 Financial Report Sentiment Analyzer", layout="wide")
21
  st.title("📊 Financial Report Sentiment Analyzer")
22
 
23
  st.markdown("""
24
  ### What is FinBERT?
25
  **FinBERT** is a language model fine-tuned specifically for financial text. It helps in detecting sentiment (Positive, Neutral, Negative) in financial reports.
 
26
  We analyze three key financial aspects:
27
  1. **Assets** – What the company owns
28
  2. **Liabilities** – What the company owes
 
30
  ---
31
  """)
32
 
33
+ # File uploader
34
  uploaded_file = st.file_uploader("📂 Upload Financial Report (.pdf or .txt)", type=["pdf", "txt"])
35
 
36
+ # CSS Styling for Report Preview
37
  st.markdown("""
38
  <style>
39
  .report-preview {
 
51
  """, unsafe_allow_html=True)
52
 
53
  if uploaded_file:
54
+ # Extract text from the uploaded file
55
  if uploaded_file.name.endswith('.pdf'):
56
  with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
57
+ report_text = "".join([page.get_text() for page in doc])
58
  else:
59
  report_text = uploaded_file.read().decode('utf-8')
60
 
61
+ # Display the uploaded report preview
62
  st.write("### 📄 Uploaded Report Preview:")
63
  st.markdown(f'''
64
  <div class="report-preview">
 
66
  </div>
67
  ''', unsafe_allow_html=True)
68
 
69
+ # Load FinBERT Model (cached for performance)
70
  @st.cache_resource
71
  def load_model():
72
  tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
 
86
  label = label_mapping[label_idx]
87
  return label, probs.tolist()[0]
88
 
89
+ # Extract sentences using regex and NLTK tokenizer
90
  def extract_sentences(text, keywords):
91
  sentences = sent_tokenize(text)
92
  keywords_lower = [k.lower() for k in keywords]
93
  pattern = re.compile(r'\b(' + '|'.join(map(re.escape, keywords_lower)) + r')\b', re.IGNORECASE)
94
  return [s for s in sentences if pattern.search(s)]
95
 
96
+ # Analyze financial sentiment category-wise
97
  def analyze_category(text, category_name, keywords):
98
  sentences = extract_sentences(text, keywords)
99
  if not sentences:
100
  st.warning(f"⚠️ No relevant sentences found for {category_name}")
101
  return None, []
102
+
103
  sentiment_scores = defaultdict(int)
104
  negative_sentences = []
105
 
 
109
  if label == 'Negative':
110
  negative_sentences.append((sentence, probs))
111
 
112
+ total = sum(sentiment_scores.values())
113
  sentiment_percentages = {
114
  'Positive': (sentiment_scores.get('Positive', 0) / total) * 100,
115
  'Negative': (sentiment_scores.get('Negative', 0) / total) * 100,
 
117
  }
118
  return sentiment_percentages, negative_sentences
119
 
120
+ # Define financial categories and keywords
121
  categories = {
122
  'Assets': [
123
  'asset', 'assets', 'current assets', 'fixed assets', 'cash equivalents',
 
138
 
139
  st.write("## 📝 Sentiment Analysis Results:")
140
 
141
+ # Perform sentiment analysis for each financial category
142
  for category, keywords in categories.items():
143
  st.write(f"### 🔍 {category}")
144
  result = analyze_category(report_text, category, keywords)
145
  if result[0] is None:
146
  continue
147
+
148
  sentiment_percentages, negative_sentences = result
149
 
150
  cols = st.columns(3)