# ocr-llm-test / app.py
# winamnd's picture
# Update app.py
# 9c1923d verified
# raw
# history blame
# 2.9 kB
import gradio as gr
import cv2
import easyocr
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# Download necessary NLTK data
nltk.data.path.append("/usr/local/lib/nltk_data")
# 'punkt' serves older NLTK releases; newer releases make word_tokenize load
# the pickle-free 'punkt_tab' tables instead, so request both. An id that the
# installed NLTK does not know is reported by the downloader, not raised.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
"""
EasyOCR for Text Extraction
"""
# Shared EasyOCR reader, created on first use. Constructing a Reader loads
# the OCR models, so it must not be repeated for every request.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        _EASYOCR_READER = easyocr.Reader(['en'])
    return _EASYOCR_READER


def ocr_with_easy(img):
    """Extract the text contained in an image using EasyOCR.

    Args:
        img: Image as a NumPy array (the UI passes the output of
            ``gr.Image()``, which is in RGB channel order), or ``None``
            when no image was supplied.

    Returns:
        The recognised text fragments joined with single spaces. An empty
        string is returned when ``img`` is ``None`` or nothing was recognised.
    """
    if img is None:
        return ''

    # Gradio hands over RGB data, so convert with the RGB weights; an array
    # that is already single-channel is used as is.
    if img.ndim == 3:
        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    else:
        gray_scale_image = img

    # Pass the array straight to EasyOCR instead of round-tripping through a
    # shared 'image.png', which concurrent requests would overwrite.
    # `paragraph` must be a real boolean: the string "False" is truthy and
    # silently switched paragraph mode on.
    fragments = _get_easyocr_reader().readtext(
        gray_scale_image, paragraph=False, detail=0
    )
    return ' '.join(fragments)
"""
Text Preprocessing for Spam Classification
"""
# English stop words, loaded on first use. preprocess_text runs once per
# dataset row and once per request, so the corpus is read only a single time.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text):
    """Normalise text for the spam classifier.

    The text is lower-cased and tokenised, tokens that are not purely
    alphanumeric or that are English stop words are dropped, and the
    remaining tokens are Porter-stemmed.

    Args:
        text: Raw input string.

    Returns:
        The stemmed tokens joined with single spaces (possibly empty).
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in _ENGLISH_STOP_WORDS
    )
"""
Load and Train Spam Classifier
"""
# Read the labelled corpus: column 'v1' carries the ham/spam label and
# column 'v2' the message text.
data = pd.read_csv('spam.csv', encoding='latin-1')

# Encode the labels numerically (ham -> 0, spam -> 1).
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})

# Run every message through the same preprocessing used at prediction time.
data['v2'] = data['v2'].apply(preprocess_text)

# Turn the cleaned messages into TF-IDF feature vectors.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

# Fit a Random Forest on the TF-IDF features; the fixed seed keeps the
# trained model reproducible between runs.
rf_classifier = RandomForestClassifier(random_state=42).fit(
    tfidf_matrix, data['v1']
)
"""
OCR and Spam Classification Pipeline
"""
def ocr_and_classify_spam(img):
    """Run OCR on an image and classify the extracted text as spam or not.

    Args:
        img: Image supplied by the Gradio image input, or ``None`` when the
            form is submitted without an image.

    Returns:
        A ``(extracted_text, spam_result)`` tuple. ``spam_result`` is
        ``"SPAM"`` or ``"NOT SPAM"``, or an explanatory message when there
        is no image or no text to classify.
    """
    # Submitting the form without an image delivers None; report that
    # instead of failing inside the OCR step.
    if img is None:
        return "", "No image provided."

    # Step 1: Extract text from the image using EasyOCR
    extracted_text = ocr_with_easy(img)
    if not extracted_text:
        return extracted_text, "No text found in the image."

    # Step 2: Preprocess and classify the extracted text
    processed_text = preprocess_text(extracted_text)
    input_tfidf = tfidf_vectorizer.transform([processed_text])
    prediction = rf_classifier.predict(input_tfidf)
    spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
    return extracted_text, spam_result
"""
Create User Interface with Gradio
"""
# Input component: the image to run OCR on.
image = gr.Image()

# Output components: the recognised text and the classifier verdict.
output_text, output_classification = (
    gr.Textbox(label="Extracted Text"),
    gr.Textbox(label="Spam Classification"),
)

# Wire the OCR + classification pipeline into a single-page interface.
demo = gr.Interface(
    title="OCR and Spam Classifier",
    description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.",
    fn=ocr_and_classify_spam,
    inputs=image,
    outputs=[output_text, output_classification],
    css=".gradio-container {background-color: lightgray}",
)

# Start the web server for the interface.
demo.launch()