"""Gradio demo: extract text from an image with EasyOCR and classify it as spam.

At import time this module downloads the required NLTK data, trains a
TF-IDF + random-forest classifier on ``spam.csv`` (columns ``v1`` = label,
``v2`` = message text), builds the Gradio interface and launches it.
"""
import functools

import cv2
import easyocr
import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data
nltk.data.path.append("/usr/local/lib/nltk_data")
nltk.download('punkt')
# Newer NLTK releases load the tokenizer model from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Built once: preprocess_text runs for every dataset row and every request,
# so re-reading the stop-word corpus on each call is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


# ---------------------------------------------------------------------------
# EasyOCR for Text Extraction
# ---------------------------------------------------------------------------

@functools.lru_cache(maxsize=1)
def _get_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    # Constructing a Reader loads the detection/recognition models, which is
    # far too slow to repeat on every request.
    return easyocr.Reader(['en'])


def ocr_with_easy(img):
    """Extract the text found in an image using EasyOCR.

    Args:
        img: Image as a numpy array, either 2-D (already grayscale) or
            channel-last RGB/RGBA as delivered by the Gradio image input.

    Returns:
        All recognized text fragments joined with single spaces; an empty
        string when nothing is recognized.
    """
    if img.ndim == 2:
        # Already single-channel; cvtColor would reject it.
        gray_scale_image = img
    else:
        # Gradio hands over RGB (not OpenCV's BGR) channel order.
        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Pass the array directly instead of round-tripping through a shared
    # 'image.png' file, which concurrent requests would overwrite.
    # paragraph must be the boolean False: the string "False" is truthy and
    # would silently enable paragraph merging.
    bounds = _get_reader().readtext(gray_scale_image, paragraph=False, detail=0)
    return ' '.join(bounds)


# ---------------------------------------------------------------------------
# Text Preprocessing for Spam Classification
# ---------------------------------------------------------------------------

def preprocess_text(text):
    """Normalize text for the TF-IDF vectorizer.

    Lower-cases and tokenizes ``text``, drops tokens that are not purely
    alphanumeric or that are English stop words, and Porter-stems the rest.

    Args:
        text: Raw message text.

    Returns:
        The stemmed tokens joined with single spaces.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(
        _STEMMER.stem(word)
        for word in tokens
        if word.isalnum() and word not in _STOP_WORDS
    )


# ---------------------------------------------------------------------------
# Load and Train Spam Classifier
# ---------------------------------------------------------------------------

# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')
data['v2'] = data['v2'].apply(preprocess_text)

# Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

# Label Encoding
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(tfidf_matrix, data['v1'])


# ---------------------------------------------------------------------------
# OCR and Spam Classification Pipeline
# ---------------------------------------------------------------------------

def ocr_and_classify_spam(img):
    """Run OCR on an image and classify the extracted text.

    Args:
        img: Image array from the Gradio input, or ``None`` when the user
            submitted without providing an image.

    Returns:
        A ``(extracted_text, result)`` tuple where ``result`` is ``"SPAM"``,
        ``"NOT SPAM"``, or an explanatory message when there is nothing to
        classify.
    """
    if img is None:
        # Gradio passes None when no image was uploaded.
        return "", "No image provided."

    # Step 1: Extract text from the image using EasyOCR
    extracted_text = ocr_with_easy(img)
    if not extracted_text.strip():
        return extracted_text, "No text found in the image."

    # Step 2: Preprocess and classify the extracted text
    processed_text = preprocess_text(extracted_text)
    input_tfidf = tfidf_vectorizer.transform([processed_text])
    prediction = rf_classifier.predict(input_tfidf)
    spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
    return extracted_text, spam_result


# ---------------------------------------------------------------------------
# Create User Interface with Gradio
# ---------------------------------------------------------------------------

image = gr.Image()
output_text = gr.Textbox(label="Extracted Text")
output_classification = gr.Textbox(label="Spam Classification")

demo = gr.Interface(
    fn=ocr_and_classify_spam,
    inputs=image,
    outputs=[output_text, output_classification],
    title="OCR and Spam Classifier",
    description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.",
    css=".gradio-container {background-color: lightgray}"
)

demo.launch()