Spaces:

nslaughter
/

flashcard-studio

Sleeping

Nathan Slaughter

add pipeline method

2f264ab 8 months ago

2.8 kB

	import os
	import pymupdf4llm

	def process_pdf(pdf_path: str) -> str:
	    """
	    Extract the content of a PDF file as Markdown text using pymupdf4llm.

	    Args:
	        pdf_path: Path to the PDF file to read.

	    Returns:
	        The document content produced by ``pymupdf4llm.to_markdown``.

	    Raises:
	        ValueError: If the PDF cannot be opened or converted. The
	            underlying exception is attached as ``__cause__``.
	    """
	    try:
	        # pymupdf4llm has no `extract_text`; `to_markdown` is its
	        # documented conversion entry point.
	        return pymupdf4llm.to_markdown(pdf_path)
	    except Exception as e:
	        # The library's exception types are not part of this module's
	        # contract, so any failure is surfaced as a ValueError.
	        raise ValueError(f"Error processing PDF: {e}") from e

	def read_text_file(file_path: str) -> str:
	    """
	    Read the full contents of a .txt or .md file as text.

	    Args:
	        file_path: Path to the text file to read.

	    Returns:
	        The decoded file contents, without a leading byte-order mark.

	    Raises:
	        ValueError: If the file cannot be opened or decoded. The
	            underlying exception is attached as ``__cause__``.
	    """
	    try:
	        # "utf-8-sig" decodes plain UTF-8 and also drops a leading BOM,
	        # which some editors prepend and which would otherwise end up
	        # as a stray "\ufeff" at the start of the text.
	        with open(file_path, 'r', encoding='utf-8-sig') as f:
	            return f.read()
	    except Exception as e:
	        raise ValueError(f"Error reading text file: {e}") from e

	def format_prompt(output_format: str) -> str:
	    """
	    Return the prompt fragment that tells the model which output format to use.

	    Args:
	        output_format: "json" or "csv", compared case-insensitively.

	    Returns:
	        The instruction text, including a short example, for that format.

	    Raises:
	        ValueError: If output_format is neither "json" nor "csv".
	    """
	    requested = output_format.lower()
	    if requested == "json":
	        return """You only respond in JSON format. Follow the example below.

	EXAMPLE:
	[
	{"question": "What is AI?", "answer": "Artificial Intelligence."},
	{"question": "What is ML?", "answer": "Machine Learning."}
	]
	"""
	    if requested == "csv":
	        return """You only respond with cards in CSV format. Follow the example below.

	EXAMPLE:
	"What is AI?", "Artificial Intelligence."
	"What is ML?", "Machine Learning."
	"""
	    # Fail loudly instead of falling through and returning None, which
	    # would silently put the text "None" into any prompt built from it.
	    raise ValueError(f"Unsupported output format: {output_format!r}")

	# def extract_flashcards(text: str, output_format: str, pipeline: str) -> str:
	# """
	# Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
	# """
	# prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.

	# {format_prompt(output_format)}


	# Extract flashcards from the user's text:

	# {text}

	# Do not include the prompt or any other unnecessary information in the flashcards.
	# Do not include triple ticks (```) or any other code blocks in the flashcards.
	# """
	# # TODO:
	# response = pipeline.generate_flashcards("json", prompt)
	# return response

	def process_file(file_obj, output_format: str, pipeline) -> str:
	    """
	    Turn an uploaded file into flashcards.

	    The file's path is taken from ``file_obj.name`` and its extension
	    (case-insensitive) selects the reader: ``.pdf`` goes through
	    ``process_pdf``, ``.txt`` and ``.md`` through ``read_text_file``.
	    The extracted text is then passed to
	    ``pipeline.generate_flashcards(output_format, text)`` and that
	    result is returned.

	    Raises:
	        ValueError: If the extension is not .pdf, .txt or .md.
	    """
	    source_path = file_obj.name
	    _, suffix = os.path.splitext(source_path)
	    suffix = suffix.lower()

	    # Reject unknown extensions before attempting to read anything.
	    if suffix not in ('.pdf', '.txt', '.md'):
	        raise ValueError("Unsupported file type.")

	    extracted = (
	        process_pdf(source_path)
	        if suffix == '.pdf'
	        else read_text_file(source_path)
	    )
	    return pipeline.generate_flashcards(output_format, extracted)

	def process_text_input(output_format: str, input_text: str, pipeline=None) -> str:
	    """
	    Generate flashcards from text typed or pasted by the user.

	    Args:
	        output_format: Format name forwarded to the pipeline.
	        input_text: The text to extract flashcards from.
	        pipeline: Object exposing ``generate_flashcards(output_format, text)``.
	            When omitted, a module-level ``pipeline`` is used if one
	            exists, so two-argument callers keep working.

	    Returns:
	        Whatever ``pipeline.generate_flashcards`` returns.

	    Raises:
	        ValueError: If input_text is None, empty or whitespace-only, or
	            if no pipeline is available.
	    """
	    # `not input_text` also covers None, which has no .strip().
	    if not input_text or not input_text.strip():
	        raise ValueError("No text provided.")

	    if pipeline is None:
	        # The parameter shadows any module-level name, so the legacy
	        # global has to be looked up explicitly.
	        pipeline = globals().get("pipeline")
	        if pipeline is None:
	            raise ValueError("No pipeline provided.")

	    return pipeline.generate_flashcards(output_format, input_text)