# flashcard-studio/app/processing.py
# Nathan Slaughter
# add pipeline method
# 2f264ab
# raw
# history blame
# 2.8 kB
import os
import pymupdf4llm
def process_pdf(pdf_path: str) -> str:
    """
    Extracts text from a PDF file using pymupdf4llm.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The document content as a single Markdown-formatted string.

    Raises:
        ValueError: If the PDF cannot be opened or converted. The original
            exception is attached as ``__cause__``.
    """
    try:
        # pymupdf4llm's public conversion entry point is to_markdown();
        # the package has no extract_text() function.
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Broad catch is deliberate: callers only expect ValueError from here.
        raise ValueError(f"Error processing PDF: {str(e)}") from e
def read_text_file(file_path: str) -> str:
    """
    Reads text from a .txt or .md file.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        The full contents of the file as a string.

    Raises:
        ValueError: If the file cannot be opened, read, or decoded as UTF-8.
            The underlying exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
    except Exception as e:
        # Broad catch is deliberate: callers only expect ValueError from here.
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"Error reading text file: {str(e)}") from e
    return text
def format_prompt(output_format: str) -> str:
    """
    Formats the prompt based on the output type.

    Args:
        output_format: Either "json" or "csv" (case-insensitive).

    Returns:
        The instruction text, including a worked example, that tells the
        model which output format to use.

    Raises:
        ValueError: If ``output_format`` is neither "json" nor "csv".
    """
    # Normalize once so "JSON", "Json" and "json" select the same template.
    normalized = output_format.lower()

    # The continuation lines of the templates are intentionally flush-left:
    # any indentation would become part of the prompt text.
    if normalized == "json":
        return """You only respond in JSON format. Follow the example below.
EXAMPLE:
[
{"question": "What is AI?", "answer": "Artificial Intelligence."},
{"question": "What is ML?", "answer": "Machine Learning."}
]
"""
    if normalized == "csv":
        return """You only respond with cards in CSV format. Follow the example below.
EXAMPLE:
"What is AI?", "Artificial Intelligence."
"What is ML?", "Machine Learning."
"""
    # Fail loudly instead of silently returning None for an unknown format.
    raise ValueError(f"Unsupported output format: {output_format!r}")
# def extract_flashcards(text: str, output_format: str, pipeline: str) -> str:
# """
# Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
# """
# prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
# {format_prompt(output_format)}
# Extract flashcards from the user's text:
# {text}
# Do not include the prompt or any other unnecessary information in the flashcards.
# Do not include triple ticks (```) or any other code blocks in the flashcards.
# """
# # TODO:
# response = pipeline.generate_flashcards("json", prompt)
# return response
def process_file(file_obj, output_format: str, pipeline) -> str:
    """
    Turns an uploaded file into flashcards.

    The file's extension (taken from ``file_obj.name``, case-insensitive)
    selects the reader: ``.pdf`` goes through ``process_pdf``; ``.txt`` and
    ``.md`` go through ``read_text_file``. The extracted text is then passed
    to ``pipeline.generate_flashcards`` together with ``output_format``.

    Raises:
        ValueError: If the extension is not .pdf, .txt or .md.
    """
    source_path = file_obj.name
    _, suffix = os.path.splitext(source_path)
    suffix = suffix.lower()

    # Reject unknown extensions up front, before any file I/O happens.
    if suffix not in ('.pdf', '.txt', '.md'):
        raise ValueError("Unsupported file type.")

    if suffix == '.pdf':
        extracted = process_pdf(source_path)
    else:
        extracted = read_text_file(source_path)

    return pipeline.generate_flashcards(output_format, extracted)
def process_text_input(output_format: str, input_text: str, pipeline=None) -> str:
    """
    Processes the input text and extracts flashcards.

    Args:
        output_format: Output format forwarded to the pipeline.
        input_text: Raw text to turn into flashcards.
        pipeline: Object exposing ``generate_flashcards(output_format, text)``.
            Optional for backward compatibility with two-argument callers;
            when omitted, a module-level ``pipeline`` is used if one exists.

    Returns:
        Whatever ``pipeline.generate_flashcards`` returns for the input.

    Raises:
        ValueError: If ``input_text`` is empty, whitespace-only or None, or
            if no pipeline is available.
    """
    # Treat None the same as an empty string instead of failing on .strip().
    if not input_text or not input_text.strip():
        raise ValueError("No text provided.")

    if pipeline is None:
        # The function previously relied on a global named `pipeline`;
        # keep honoring it when the caller does not pass one explicitly.
        pipeline = globals().get("pipeline")
    if pipeline is None:
        raise ValueError("No pipeline provided.")

    return pipeline.generate_flashcards(output_format, input_text)