from typing import List import os import sys import subprocess from pathlib import Path import pytesseract from PIL import Image import gradio as gr def check_tesseract_installed(): try: subprocess.run(['tesseract', '--version'], capture_output=True, check=True) return True except (subprocess.CalledProcessError, FileNotFoundError): return False def setup_tesseract(): # Check if Tesseract is installed if not check_tesseract_installed(): print("Tesseract is not installed. Please install it using:") print("sudo apt-get update && sudo apt-get install -y tesseract-ocr") print("For additional languages: sudo apt-get install -y tesseract-ocr-all") sys.exit(1) # Set TESSDATA_PREFIX if not already set if 'TESSDATA_PREFIX' not in os.environ: # Common locations for tessdata tessdata_paths = [ '/usr/share/tesseract-ocr/4.00/tessdata', # Newer versions '/usr/share/tesseract-ocr/tessdata', # Older versions '/usr/share/tessdata', # Alternative location ] for path in tessdata_paths: if Path(path).exists(): os.environ['TESSDATA_PREFIX'] = path print(f"Set TESSDATA_PREFIX to {path}") break else: print("Warning: Could not find tessdata directory") print("Please install language data files or set TESSDATA_PREFIX manually") def tesseract_ocr(filepath: str, languages: List[str]=None): if languages is None: languages = ['eng'] # Default to English if no language specified image = Image.open(filepath) return pytesseract.image_to_string(image=image, lang='+'.join(languages)) title = "Tesseract OCR" description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine." article = "
Tesseract documentation | Github Repo
" examples = [ ["examples/weird_unicode_math_symbols.png", []], ["examples/eurotext.png", ["eng"]], ["examples/tesseract_sample.png", ["jpn", "eng"]], ["examples/chi.jpg", ["HanS", "HanT"]], ] with gr.Blocks(title=title) as demo: gr.Markdown(f'