# tesseract-multi-ocr / app_blocks.py
# Hugging Face Space file by pvanand -- commit 9ad522d (verified): "Update app_blocks.py", 3.36 kB
from typing import List
import os
import sys
import subprocess
from pathlib import Path
import pytesseract
from PIL import Image
import gradio as gr
def check_tesseract_installed() -> bool:
    """Report whether the ``tesseract`` command-line program can be run.

    Runs ``tesseract --version`` with its output captured.

    Returns:
        True if the command starts and exits with status 0; False if it
        exits with a non-zero status or cannot be started at all.
    """
    try:
        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary not on PATH) as well as
        # PermissionError and similar launch failures (binary present but not
        # executable). All of them mean Tesseract is unusable, so report that
        # instead of letting the probe itself crash.
        return False
    return True
def setup_tesseract():
    """Check that Tesseract is usable and make sure TESSDATA_PREFIX is set.

    If ``check_tesseract_installed()`` reports failure, installation hints are
    printed and the process exits with status 1.

    If the ``TESSDATA_PREFIX`` environment variable is already set it is left
    untouched. Otherwise it is set to the first directory that exists among a
    list of common tessdata locations; when none exists, a warning is printed
    and the variable stays unset.
    """
    if not check_tesseract_installed():
        print("Tesseract is not installed. Please install it using:")
        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr")
        print("For additional languages: sudo apt-get install -y tesseract-ocr-all")
        sys.exit(1)

    # An explicit setting from the user or deployment always wins.
    if 'TESSDATA_PREFIX' in os.environ:
        return

    # Common locations for tessdata, probed in order. The Tesseract 5 path is
    # appended last so the outcome is unchanged on any system where one of the
    # previously probed directories exists.
    candidate_dirs = (
        '/usr/share/tesseract-ocr/4.00/tessdata',  # Debian/Ubuntu, Tesseract 4.x
        '/usr/share/tesseract-ocr/tessdata',       # Older versions
        '/usr/share/tessdata',                     # Alternative location
        '/usr/share/tesseract-ocr/5/tessdata',     # Debian/Ubuntu, Tesseract 5.x
    )
    # is_dir() rather than exists(): a stray regular file at one of these
    # paths is not a usable tessdata directory.
    tessdata_dir = next((d for d in candidate_dirs if Path(d).is_dir()), None)

    if tessdata_dir is None:
        print("Warning: Could not find tessdata directory")
        print("Please install language data files or set TESSDATA_PREFIX manually")
        return

    os.environ['TESSDATA_PREFIX'] = tessdata_dir
    print(f"Set TESSDATA_PREFIX to {tessdata_dir}")
def tesseract_ocr(filepath: str, languages: List[str]=None) -> str:
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        filepath: Path of the image to read; opened with ``PIL.Image.open``.
        languages: Tesseract language codes to use together (joined with
            ``+``). ``None`` or an empty list falls back to ``['eng']``.

    Returns:
        The result of ``pytesseract.image_to_string`` for the image.
    """
    # `not languages` covers both None and []: the UI sends an empty list when
    # no checkbox is ticked (e.g. after "Clear", or for an example row with no
    # languages), and joining an empty list would pass lang='' to Tesseract.
    if not languages:
        languages = ['eng']  # Default to English if no language specified

    # Context manager so the underlying file handle is released after OCR.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang='+'.join(languages))
# --- Static page content -----------------------------------------------------

# Heading shown at the top of the page and used as the browser tab title.
title = "Tesseract OCR"

# Short blurb rendered under the heading.
description = (
    "Gradio demo for Tesseract. "
    "Tesseract is an open source text recognition (OCR) Engine."
)

# HTML footer with links to the upstream project.
article = (
    "<p style='text-align: center'>"
    "<a href='https://tesseract-ocr.github.io/' target='_blank'>Tesseract documentation</a>"
    " | "
    "<a href='https://github.com/tesseract-ocr/tesseract' target='_blank'>Github Repo</a>"
    "</p>"
)

# Example rows: [image path, list of Tesseract language codes].
examples = [
    ["examples/weird_unicode_math_symbols.png", []],
    ["examples/eurotext.png", ["eng"]],
    ["examples/tesseract_sample.png", ["jpn", "eng"]],
    ["examples/chi.jpg", ["HanS", "HanT"]],
]
# Environment setup must happen BEFORE the UI is built: constructing the UI
# calls pytesseract.get_languages(), which needs a working Tesseract (and, on
# some systems, TESSDATA_PREFIX). Previously setup_tesseract() ran only in the
# __main__ guard, i.e. after that call, so a missing install surfaced as a
# pytesseract error instead of the installation hint, and TESSDATA_PREFIX was
# set too late to influence the language list.
# NOTE: setup_tesseract() exits the process when Tesseract is not installed.
setup_tesseract()

with gr.Blocks(title=title) as demo:
    gr.Markdown(f'<h1 style="text-align: center; margin-bottom: 1rem;">{title}</h1>')
    gr.Markdown(description)

    with gr.Row():
        # Left column: image input, language selection, action buttons.
        with gr.Column():
            image = gr.Image(type="filepath", label="Input")
            # Offer the languages reported by the local Tesseract installation.
            language_choices = pytesseract.get_languages()
            with gr.Accordion("Languages", open=False):
                languages = gr.CheckboxGroup(language_choices, type="value", value=["eng"], label='language')
            with gr.Row():
                btn_clear = gr.ClearButton([image, languages])
                btn_submit = gr.Button(value="Submit", variant="primary")
        # Right column: recognized text.
        with gr.Column():
            text = gr.Textbox(label="Output")

    btn_submit.click(tesseract_ocr, inputs=[image, languages], outputs=text, api_name="tesseract-ocr")
    # The output box is created after the clear button, so register it here.
    btn_clear.add(text)

    gr.Examples(
        examples=examples,
        inputs=[image, languages],
    )

    gr.Markdown(article)

if __name__ == '__main__':
    demo.launch()