Spaces:
Sleeping
Sleeping
Update app_blocks.py
Browse files- app_blocks.py +41 -1
app_blocks.py
CHANGED
@@ -1,13 +1,52 @@
|
|
1 |
from typing import List
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import pytesseract
|
4 |
from PIL import Image
|
5 |
|
6 |
import gradio as gr
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
def tesseract_ocr(filepath: str, languages: List[str]=None):
|
|
|
|
|
9 |
image = Image.open(filepath)
|
10 |
-
return pytesseract.image_to_string(image=image, lang='
|
11 |
|
12 |
title = "Tesseract OCR"
|
13 |
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
|
@@ -45,4 +84,5 @@ with gr.Blocks(title=title) as demo:
|
|
45 |
gr.Markdown(article)
|
46 |
|
47 |
if __name__ == '__main__':
|
|
|
48 |
demo.launch()
|
|
|
1 |
from typing import List
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import subprocess
|
5 |
+
from pathlib import Path
|
6 |
|
7 |
import pytesseract
|
8 |
from PIL import Image
|
9 |
|
10 |
import gradio as gr
|
11 |
|
12 |
+
def check_tesseract_installed() -> bool:
    """Report whether the ``tesseract`` command-line program can be run.

    Runs ``tesseract --version`` with its output captured (nothing is echoed
    to the console) and checks that it exits with status 0.

    Returns:
        True if the command started and exited successfully, False if it
        exited with a non-zero status or could not be started at all.
    """
    try:
        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (no such program on PATH) and also
        # PermissionError (a "tesseract" file exists but is not executable),
        # which previously escaped as an uncaught exception.
        return False
    return True
|
18 |
+
|
19 |
+
def setup_tesseract() -> None:
    """Verify Tesseract is available and make sure TESSDATA_PREFIX is set.

    If ``check_tesseract_installed()`` reports that Tesseract cannot be run,
    installation instructions are printed to stderr and the process exits
    with status 1.

    Otherwise, when ``TESSDATA_PREFIX`` is not already present in the
    environment, a fixed list of candidate tessdata directories is probed in
    order and the first one that exists is exported as ``TESSDATA_PREFIX``.
    If none exists, a warning is printed and the variable is left unset.

    Raises:
        SystemExit: with code 1 when Tesseract is not installed.
    """
    if not check_tesseract_installed():
        # Fatal diagnostics belong on stderr so they are not mixed into
        # regular program output.
        print("Tesseract is not installed. Please install it using:", file=sys.stderr)
        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr", file=sys.stderr)
        print("For additional languages: sudo apt-get install -y tesseract-ocr-all", file=sys.stderr)
        sys.exit(1)

    # Respect an explicit setting made by the user or the deployment.
    if 'TESSDATA_PREFIX' in os.environ:
        return

    # Common locations for tessdata, probed in order.
    tessdata_paths = [
        '/usr/share/tesseract-ocr/4.00/tessdata',  # Newer versions
        '/usr/share/tesseract-ocr/tessdata',  # Older versions
        '/usr/share/tessdata',  # Alternative location
        # Debian/Ubuntu Tesseract 5 packages. Listed last so that systems
        # where one of the locations above already matched resolve exactly
        # as they did before.
        '/usr/share/tesseract-ocr/5/tessdata',
    ]

    for path in tessdata_paths:
        # is_dir() rather than exists(): a stray regular file with this name
        # is not a usable tessdata directory.
        if Path(path).is_dir():
            os.environ['TESSDATA_PREFIX'] = path
            print(f"Set TESSDATA_PREFIX to {path}")
            break
    else:
        # Loop finished without a break: no candidate directory was found.
        print("Warning: Could not find tessdata directory")
        print("Please install language data files or set TESSDATA_PREFIX manually")
|
44 |
+
|
45 |
def tesseract_ocr(filepath: str, languages: List[str] = None) -> str:
    """Run Tesseract OCR on an image file and return the recognized text.

    Args:
        filepath: Path of the image file to read.
        languages: Tesseract language codes to use together (for example
            ``['eng', 'deu']``). They are joined with ``+`` into the single
            ``lang`` string passed to pytesseract. ``None`` or an empty list
            falls back to English.

    Returns:
        The result of ``pytesseract.image_to_string`` for the image.
    """
    # "not languages" also catches an empty list (e.g. no language ticked in
    # the UI), which would otherwise be joined into an empty lang string.
    if not languages:
        languages = ['eng']  # Default to English if no language specified

    # The context manager closes the underlying file once OCR is done
    # instead of leaving the handle open until garbage collection.
    with Image.open(filepath) as image:
        return pytesseract.image_to_string(image=image, lang='+'.join(languages))
|
50 |
|
51 |
title = "Tesseract OCR"
|
52 |
description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
|
|
|
84 |
gr.Markdown(article)
|
85 |
|
86 |
if __name__ == '__main__':
|
87 |
+
setup_tesseract()
|
88 |
demo.launch()
|