pvanand committed on
Commit
9ad522d
·
verified ·
1 Parent(s): c8cead5

Update app_blocks.py

Browse files
Files changed (1) hide show
  1. app_blocks.py +41 -1
app_blocks.py CHANGED
@@ -1,13 +1,52 @@
1
  from typing import List
 
 
 
 
2
 
3
  import pytesseract
4
  from PIL import Image
5
 
6
  import gradio as gr
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def tesseract_ocr(filepath: str, languages: List[str]=None):
 
 
9
  image = Image.open(filepath)
10
- return pytesseract.image_to_string(image=image, lang=', '.join(languages) if languages else None)
11
 
12
  title = "Tesseract OCR"
13
  description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
@@ -45,4 +84,5 @@ with gr.Blocks(title=title) as demo:
45
  gr.Markdown(article)
46
 
47
  if __name__ == '__main__':
 
48
  demo.launch()
 
1
  from typing import List
2
+ import os
3
+ import sys
4
+ import subprocess
5
+ from pathlib import Path
6
 
7
  import pytesseract
8
  from PIL import Image
9
 
10
  import gradio as gr
11
 
12
def check_tesseract_installed():
    """Report whether a runnable ``tesseract`` executable is reachable on PATH.

    Runs ``tesseract --version`` with its output captured and treats a zero
    exit status as success.

    Returns:
        bool: True if the command started and exited with status 0; False if
        it exited non-zero or could not be started at all.
    """
    try:
        subprocess.run(['tesseract', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (binary missing) as well as
        # PermissionError / NotADirectoryError (entry present but not
        # runnable); in every one of these cases Tesseract is unusable.
        return False
    return True
18
+
19
def setup_tesseract():
    """Verify Tesseract is available and point TESSDATA_PREFIX at its data.

    If ``check_tesseract_installed()`` reports failure, prints installation
    instructions and exits the process with status 1. Otherwise, when
    ``TESSDATA_PREFIX`` is not already present in the environment, sets it to
    the first candidate directory that exists and prints the chosen path; if
    no candidate exists, prints a warning and leaves the environment unchanged.
    """
    # Check if Tesseract is installed
    if not check_tesseract_installed():
        print("Tesseract is not installed. Please install it using:")
        print("sudo apt-get update && sudo apt-get install -y tesseract-ocr")
        print("For additional languages: sudo apt-get install -y tesseract-ocr-all")
        sys.exit(1)

    # Respect an explicit TESSDATA_PREFIX supplied by the user or deployment.
    if 'TESSDATA_PREFIX' in os.environ:
        return

    # Common locations for tessdata, probed in order.
    tessdata_paths = [
        '/usr/share/tesseract-ocr/5/tessdata',     # Tesseract 5.x packages
        '/usr/share/tesseract-ocr/4.00/tessdata',  # Tesseract 4.x packages
        '/usr/share/tesseract-ocr/tessdata',       # Unversioned layout
        '/usr/share/tessdata',                     # Alternative location
    ]

    for path in tessdata_paths:
        # Only a directory can hold traineddata files; ignore stray files.
        if Path(path).is_dir():
            os.environ['TESSDATA_PREFIX'] = path
            print(f"Set TESSDATA_PREFIX to {path}")
            break
    else:
        # The loop finished without a break: none of the candidates exist.
        print("Warning: Could not find tessdata directory")
        print("Please install language data files or set TESSDATA_PREFIX manually")
44
+
45
  def tesseract_ocr(filepath: str, languages: List[str]=None):
46
+ if languages is None:
47
+ languages = ['eng'] # Default to English if no language specified
48
  image = Image.open(filepath)
49
+ return pytesseract.image_to_string(image=image, lang='+'.join(languages))
50
 
51
  title = "Tesseract OCR"
52
  description = "Gradio demo for Tesseract. Tesseract is an open source text recognition (OCR) Engine."
 
84
  gr.Markdown(article)
85
 
86
  if __name__ == '__main__':
87
+ setup_tesseract()
88
  demo.launch()