import gradio as gr
import json
import urllib.request
import os
import ssl
import base64
import tempfile
import edge_tts
import re
import logging
from typing import Optional

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Azure ML endpoint configuration - these should be set as environment variables
url = os.getenv("AZURE_ENDPOINT")
api_key = os.getenv("AZURE_API_KEY")

def call_aml_endpoint(payload, url, api_key):
    """Call Azure ML endpoint with the given payload."""
    # Allow self-signed HTTPS certificates
    def allow_self_signed_https(allowed):
        if allowed and not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
            ssl._create_default_https_context = ssl._create_unverified_context
    allow_self_signed_https(True)

    # Set default sampling parameters (can be adjusted based on your needs)
    parameters = {"temperature": 0.7}
    if "parameters" not in payload["input_data"]:
        payload["input_data"]["parameters"] = parameters

    # Encode the request body
    body = str.encode(json.dumps(payload))
    if not api_key:
        raise Exception("A key should be provided to invoke the endpoint")

    # Set up headers
    headers = {'Content-Type': 'application/json', 'Authorization': ('Bearer ' + api_key)}

    # Create and send the request
    req = urllib.request.Request(url, body, headers)
    try:
        logger.info(f"Sending request to {url}")
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        logger.info("Received response successfully")
        return json.loads(result)
    except urllib.error.HTTPError as error:
        logger.error(f"Request failed with status code: {error.code}")
        logger.error(f"Headers: {error.info()}")
        error_message = error.read().decode("utf8", 'ignore')
        logger.error(f"Error message: {error_message}")
        return {"error": error_message}

def encode_base64_from_file(file_path):
    """Encode file content to a base64 string and determine its MIME type."""
    file_extension = os.path.splitext(file_path)[1].lower()

    # Map file extensions to MIME types
    if file_extension in ['.jpg', '.jpeg']:
        mime_type = "image/jpeg"
    elif file_extension == '.png':
        mime_type = "image/png"
    elif file_extension == '.gif':
        mime_type = "image/gif"
    elif file_extension in ['.bmp', '.tiff', '.webp']:
        mime_type = f"image/{file_extension[1:]}"
    else:
        mime_type = "image/jpeg"  # Default to JPEG

    # Read and encode file content
    with open(file_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode('utf-8')
    return encoded_string, mime_type
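
# Usage sketch, using one of the example images bundled with this Space:
#   encoded, mime = encode_base64_from_file("content/kid.handwriting.draw.01.jpg")
#   data_url = f"data:{mime};base64,{encoded}"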

class ImageOCRApp:
    def __init__(self):
        """Initialize the app with Azure ML endpoint configuration."""
        # Warn early if the Azure endpoint or key is missing
        if not url or not api_key:
            logger.warning("Azure ML endpoint or API key not set. Set AZURE_ENDPOINT and AZURE_API_KEY environment variables.")

    def recognize_text(self, image_path: str) -> str:
        """Recognize text in the image using the Azure ML endpoint."""
        try:
            # Encode image to base64
            base64_image, mime_type = encode_base64_from_file(image_path)

            # Prepare prompt for OCR
            ocr_prompt = "Please identify the handwritten text in the image."

            # Create content array for the payload
            content_items = [
                {"type": "text", "text": ocr_prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}}
            ]

            # Create conversation state
            conversation_state = [
                {
                    "role": "user",
                    "content": content_items
                }
            ]

            # Create the payload
            payload = {
                "input_data": {
                    "input_string": conversation_state
                }
            }

            # Call Azure ML endpoint
            response = call_aml_endpoint(payload, url, api_key)

            # Extract the text response from the Azure ML endpoint response
            if isinstance(response, dict):
                if "result" in response:
                    result = response["result"]
                elif "output" in response:
                    # Depending on your API's response format
                    if isinstance(response["output"], list) and len(response["output"]) > 0:
                        result = response["output"][0]
                    else:
                        result = str(response["output"])
                elif "error" in response:
                    logger.error(f"Error from Azure ML endpoint: {response['error']}")
                    result = f"Error: {response['error']}"
                else:
                    # Fall back to the whole response as a string if it can't be parsed
                    result = f"Received response: {json.dumps(response)}"
            else:
                result = str(response)
            return result
        except Exception as e:
            logger.error(f"Error recognizing text: {str(e)}", exc_info=True)
            return f"Error recognizing text: {str(e)}"

    async def text_to_speech(self, text: str, voice: str = "en-US-EricNeural") -> Optional[str]:
        """Convert text to speech using Edge TTS; return the path to an MP3 file."""
        if not text.strip():
            return None
        try:
            communicate = edge_tts.Communicate(text, voice)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                tmp_path = tmp_file.name
            await communicate.save(tmp_path)
            return tmp_path
        except Exception as e:
            logger.error(f"TTS Error: {str(e)}")
            return None
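
    # Standalone usage sketch (Gradio awaits async handlers itself, but from a
    # plain script you would drive the coroutine with asyncio):
    #   import asyncio
    #   mp3_path = asyncio.run(ImageOCRApp().text_to_speech("Hello there"))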

    def create_interface(self):
        """Create the Gradio interface."""
        custom_css = """
        .container { max-width: 900px; margin: auto; }
        .input-section {
            background: #f8f9fa;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 20px;
        }
        .output-section {
            background: #ffffff;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        """
        with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as interface:
            # Header
            gr.Markdown("""
            # ✨ Stories Come Alive
            ### Transform handwritten moments into spoken memories

            Turn precious handwritten stories, notes, and drawings into living words.
            Whether it's a child's imaginative tale, a heartfelt letter, or a creative
            story - let's bring those special handwritten moments to life through sight
            and sound. 🎨📝🎧 Currently supports English only. Other demos include the
            [Phi-4-Mini playground](https://huggingface.co/spaces/microsoft/phi-4-mini),
            [Thoughts Organizer](https://huggingface.co/spaces/microsoft/ThoughtsOrganizer), and
            [Phine Speech Translator](https://huggingface.co/spaces/microsoft/PhineSpeechTranslator).
            """)

            with gr.Row():
                # Input section
                with gr.Column(scale=1):
                    image_input = gr.Image(
                        label="Upload or Capture Image",
                        sources=["upload", "webcam"],
                        type="filepath"
                    )

                    # Example selector
                    gr.Markdown("### Try with Examples")
                    example_images = [
                        ["content/kid.handwriting.draw.01.jpg", "Tiny Seed"],
                        ["content/race.for.the.moon.jpg", "To the Moon!"],
                        ["content/john.adam.move.to.dc.png", "Move to DC"],
                    ]
                    gr.Examples(
                        examples=example_images,
                        inputs=image_input,
                        label="Example Images"
                    )

                    with gr.Row():
                        process_btn = gr.Button("🔍 Recognize Text", variant="primary")
                        clear_btn = gr.Button("🗑️ Clear", variant="secondary")
                    status_msg = gr.Markdown("Ready to process image...")

                # Output section
                with gr.Column(scale=1):
                    recognized_text = gr.Textbox(
                        label="Recognized Text",
                        lines=5,
                        interactive=False  # read-only output box
                    )
                    tts_audio = gr.Audio(
                        label="Text-to-Speech Output",
                        visible=True,
                        interactive=False
                    )

            # Event handlers
            async def process_image(image):
                if image is None:
                    return "Please upload or capture an image.", None, "⚠️ Please provide an image"

                # Check if the Azure ML endpoint and API key are set
                if not url or not api_key:
                    return "Azure ML endpoint or API key not set. Please configure the environment variables.", None, "⚠️ Configuration error"

                # Recognize text using the Azure ML endpoint
                text = self.recognize_text(image)
                if not text or text.strip() == "":
                    return "No text was recognized in the image.", None, "⚠️ No text recognized"

                # Clean up text - replace newlines with spaces and collapse repeated whitespace
                cleaned_text = re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()

                # Generate audio immediately
                audio_path = await self.text_to_speech(cleaned_text)
                return text, audio_path, "✅ Text recognized and audio generated"

            def clear_inputs():
                return None, "", None, "Ready to process image..."

            process_btn.click(
                fn=process_image,
                inputs=[image_input],
                outputs=[
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="process_image"
            )

            clear_btn.click(
                fn=clear_inputs,
                inputs=[],
                outputs=[
                    image_input,
                    recognized_text,
                    tts_audio,
                    status_msg
                ],
                api_name="clear_inputs"
            )

            # Instructions
            with gr.Accordion("ℹ️ How to Use", open=False):
                gr.Markdown("""
                1. **Upload or Capture**: Use your webcam or upload an image containing text
                2. **Process**: Click 'Recognize Text' to extract text from the image
                3. **Listen**: The audio will automatically play once text is recognized

                Note: The system works best with clear, well-lit images of handwritten text.

                ### Configuration
                Before using this app, set these environment variables:
                - `AZURE_ENDPOINT`: Your Azure ML endpoint URL
                - `AZURE_API_KEY`: Your Azure ML API key
                """)

        return interface


def run_app():
    app = ImageOCRApp()
    interface = app.create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
    )


if __name__ == "__main__":
    run_app()
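
# Configuration sketch: before launching, export the two environment variables
# this app reads (names taken from the os.getenv calls above). The endpoint URL
# shape below is an assumption; use the scoring URL your Azure ML deployment
# actually reports.
#   export AZURE_ENDPOINT="https://<your-endpoint>.<region>.inference.ml.azure.com/score"
#   export AZURE_API_KEY="<your-key>"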