"""Gradio demo that runs Tesseract OCR on an uploaded image."""

import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional

import gradio as gr
import pytesseract
from PIL import Image


def check_tesseract_installed() -> bool:
    """Return True if the ``tesseract`` executable can be run successfully.

    The check runs ``tesseract --version``; a missing binary or a non-zero
    exit status both count as "not installed".
    """
    try:
        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return True


def setup_tesseract() -> None:
    """Verify that Tesseract is available and make sure TESSDATA_PREFIX is set.

    Prints installation instructions and exits the process with status 1 when
    the ``tesseract`` binary cannot be run. If ``TESSDATA_PREFIX`` is not
    already present in the environment, it is set to the first candidate
    directory that exists; a warning is printed when none of them exists.
    """
    # Check if Tesseract is installed
    if not check_tesseract_installed():
        print("Tesseract is not installed. Please install it using:")
        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr")
        print("For additional languages: sudo apt-get install -y tesseract-ocr-all")
        sys.exit(1)

    # Respect a value the user has already exported.
    if 'TESSDATA_PREFIX' in os.environ:
        return

    # Common locations for tessdata
    tessdata_paths = [
        '/usr/share/tesseract-ocr/4.00/tessdata',  # Newer versions
        '/usr/share/tesseract-ocr/tessdata',  # Older versions
        '/usr/share/tessdata',  # Alternative location
    ]
    for path in tessdata_paths:
        if Path(path).exists():
            os.environ['TESSDATA_PREFIX'] = path
            print(f"Set TESSDATA_PREFIX to {path}")
            break
    else:
        # Reached only when the loop finished without finding a directory.
        print("Warning: Could not find tessdata directory")
        print("Please install language data files or set TESSDATA_PREFIX manually")


def tesseract_ocr(filepath: str, languages: Optional[List[str]] = None) -> str:
    """Run Tesseract on the image at *filepath* and return the recognized text.

    Args:
        filepath: Path of the image file to read.
        languages: Tesseract language names to use together. ``None`` or an
            empty list (what the checkbox group yields when nothing is
            ticked) falls back to English.

    Returns:
        The text produced by ``pytesseract.image_to_string``.

    Raises:
        gr.Error: If no image path was supplied.
    """
    if not filepath:
        # Submit was pressed without an image; show a readable message in the
        # UI instead of the opaque failure from Image.open(None).
        raise gr.Error("Please provide an image.")

    if not languages:
        # Covers both None and []: joining an empty list would hand
        # Tesseract an empty language string.
        languages = ['eng']  # Default to English if no language specified

    # The context manager closes the underlying file once OCR has finished.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang='+'.join(languages))


title = "Tesseract OCR"
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
# NOTE(review): the markup around this text (and around the title heading
# below) appears to have been stripped from the file; only the surviving text
# is kept here - restore the original HTML/links if they are known.
article = "\n\nTesseract documentation | Github Repo\n\n"

examples = [
    ["examples/weird_unicode_math_symbols.png", []],
    ["examples/eurotext.png", ["eng"]],
    ["examples/tesseract_sample.png", ["jpn", "eng"]],
    ["examples/chi.jpg", ["HanS", "HanT"]],
]

# Run the environment check before the UI is built: constructing the language
# checkbox group below already invokes tesseract via
# pytesseract.get_languages(), so the install hint and the TESSDATA_PREFIX
# fallback have to happen first to be of any use.
setup_tesseract()

# Layout nesting reconstructed from the order of the calls.
with gr.Blocks(title=title) as demo:
    gr.Markdown(f'\n\n{title}\n\n')
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            image = gr.Image(type="filepath", label="Input")
            language_choices = pytesseract.get_languages()
            with gr.Accordion("Languages", open=False):
                languages = gr.CheckboxGroup(
                    language_choices,
                    type="value",
                    value=["eng"],
                    label='language',
                )
            with gr.Row():
                btn_clear = gr.ClearButton([image, languages])
                btn_submit = gr.Button(value="Submit", variant="primary")
        with gr.Column():
            text = gr.Textbox(label="Output")

    btn_submit.click(
        tesseract_ocr,
        inputs=[image, languages],
        outputs=text,
        api_name="tesseract-ocr",
    )
    # The output box is created after the clear button, so register it here.
    btn_clear.add(text)

    gr.Examples(
        examples=examples,
        inputs=[image, languages],
    )

    gr.Markdown(article)


if __name__ == '__main__':
    demo.launch()