Spaces:

winamnd
/

ocr-llm-test

Running

App Files Files Community

ocr-llm-test / app.py

winamnd

Update app.py

9c1923d verified 4 months ago

raw

history blame

2.9 kB

	import gradio as gr
	import cv2
	import easyocr
	import pandas as pd
	import nltk
	from nltk.tokenize import word_tokenize
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.ensemble import RandomForestClassifier

	# Download necessary NLTK data
	nltk.data.path.append("/usr/local/lib/nltk_data")
	# 'punkt_tab' is the tokenizer resource that word_tokenize looks up in
	# NLTK >= 3.9; 'punkt' is still fetched so older NLTK releases keep working.
	for _resource in ('punkt', 'punkt_tab', 'stopwords'):
	    nltk.download(_resource)

	"""
	EasyOCR for Text Extraction
	"""
	# Shared EasyOCR reader; built on first use so the recognition model is
	# loaded once instead of on every request.
	_EASYOCR_READER = None


	def _get_easyocr_reader():
	    """Return the shared English EasyOCR reader, creating it on first use."""
	    global _EASYOCR_READER
	    if _EASYOCR_READER is None:
	        _EASYOCR_READER = easyocr.Reader(['en'])
	    return _EASYOCR_READER


	def ocr_with_easy(img):
	    """Extract English text from an image with EasyOCR.

	    Args:
	        img: Image as a numpy array, or None when no image was supplied.
	            Colour input is assumed to be in RGB(A) channel order, which is
	            what Gradio's default numpy image delivers -- confirm before
	            calling this with arrays from another source (e.g. cv2.imread).

	    Returns:
	        The recognised text fragments joined by single spaces. An empty
	        string is returned when ``img`` is None or nothing was recognised.
	    """
	    if img is None:
	        return ''

	    # Convert to grayscale, picking the conversion that matches the layout.
	    if img.ndim == 2:
	        gray_scale_image = img  # already single-channel
	    elif img.shape[2] == 4:
	        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
	    else:
	        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

	    # Hand the array straight to EasyOCR instead of round-tripping through a
	    # fixed 'image.png', so simultaneous requests cannot overwrite each
	    # other's image. paragraph must be the boolean False: the string "False"
	    # is truthy and would switch paragraph grouping on.
	    fragments = _get_easyocr_reader().readtext(
	        gray_scale_image, paragraph=False, detail=0
	    )
	    return ' '.join(fragments)

	"""
	Text Preprocessing for Spam Classification
	"""
	# NLTK helpers shared by every preprocess_text call; built lazily so the
	# stop-word corpus is read once rather than once per message.
	_STOP_WORDS = None
	_STEMMER = None


	def _get_text_tools():
	    """Return the cached (stop-word set, stemmer) pair, building it once."""
	    global _STOP_WORDS, _STEMMER
	    if _STOP_WORDS is None or _STEMMER is None:
	        _STOP_WORDS = frozenset(stopwords.words('english'))
	        _STEMMER = PorterStemmer()
	    return _STOP_WORDS, _STEMMER


	def preprocess_text(text):
	    """Normalise text into the space-separated stems used by the classifier.

	    The text is lower-cased and tokenized; tokens that are not purely
	    alphanumeric or that are English stop words are dropped, and the
	    remaining tokens are reduced to their Porter stems.

	    Args:
	        text: The raw string to normalise.

	    Returns:
	        The surviving stems joined by single spaces (possibly empty).
	    """
	    stop_words, stemmer = _get_text_tools()
	    return ' '.join(
	        stemmer.stem(token)
	        for token in word_tokenize(text.lower())
	        if token.isalnum() and token not in stop_words
	    )

	"""
	Load and Train Spam Classifier
	"""
	# Read the labelled corpus: column 'v1' holds the ham/spam label and
	# column 'v2' the raw message text.
	data = pd.read_csv('spam.csv', encoding='latin-1')

	# Turn the textual labels into integers (ham -> 0, spam -> 1).
	data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})

	# Normalise every message with the same routine used at prediction time.
	data['v2'] = data['v2'].map(preprocess_text)

	# Learn the TF-IDF vocabulary and build the training matrix in one pass.
	tfidf_vectorizer = TfidfVectorizer()
	tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

	# Fit a Random Forest on the TF-IDF features; the fixed seed keeps
	# training reproducible. fit() returns the fitted estimator itself.
	rf_classifier = RandomForestClassifier(random_state=42).fit(
	    tfidf_matrix, data['v1']
	)

	"""
	OCR and Spam Classification Pipeline
	"""
	def ocr_and_classify_spam(img):
	    """Run OCR on an image and label the recognised text as spam or not.

	    Args:
	        img: The image to analyse; passed unchanged to ``ocr_with_easy``.

	    Returns:
	        A ``(extracted_text, spam_result)`` tuple. ``spam_result`` is
	        "SPAM" or "NOT SPAM", or "No text found in the image." when OCR
	        produced no text.
	    """
	    text = ocr_with_easy(img)

	    # Nothing recognised: skip classification entirely.
	    if not text:
	        return text, "No text found in the image."

	    # Vectorise with the vocabulary learned at training time, then predict.
	    features = tfidf_vectorizer.transform([preprocess_text(text)])
	    label = rf_classifier.predict(features)[0]
	    if label == 1:
	        verdict = "SPAM"
	    else:
	        verdict = "NOT SPAM"
	    return text, verdict

	"""
	Create User Interface with Gradio
	"""
	# Static text and styling shown by the interface.
	APP_TITLE = "OCR and Spam Classifier"
	APP_DESCRIPTION = "Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM."
	APP_CSS = ".gradio-container {background-color: lightgray}"

	# One image input and two text outputs, matching the
	# (extracted_text, spam_result) tuple returned by ocr_and_classify_spam.
	image = gr.Image()
	output_text = gr.Textbox(label="Extracted Text")
	output_classification = gr.Textbox(label="Spam Classification")

	# Wire the pipeline function to the widgets and start the app.
	demo = gr.Interface(
	    title=APP_TITLE,
	    description=APP_DESCRIPTION,
	    css=APP_CSS,
	    fn=ocr_and_classify_spam,
	    inputs=image,
	    outputs=[output_text, output_classification],
	)

	demo.launch()