# Spaces: Sleeping (page-header residue from the web capture; not part of the module)
import os

import pymupdf4llm
def process_pdf(pdf_path: str) -> str:
    """
    Extract the content of a PDF file as Markdown text using pymupdf4llm.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text returned by ``pymupdf4llm.to_markdown`` for the document.

    Raises:
        ValueError: If extraction fails for any reason. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        # ``to_markdown`` is pymupdf4llm's documented extraction entry point;
        # the previously used ``extract_text`` attribute is not part of the
        # package API, so every call ended in the error branch below.
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Keep the original message format, but chain the cause so the real
        # failure stays visible in tracebacks.
        raise ValueError(f"Error processing PDF: {str(e)}") from e
def read_text_file(file_path: str) -> str:
    """
    Read the full contents of a UTF-8 text file (e.g. ``.txt`` or ``.md``).

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents.

    Raises:
        ValueError: If the file cannot be opened or decoded. The underlying
            exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so callers can inspect the real I/O or decoding
        # error behind the uniform ValueError.
        raise ValueError(f"Error reading text file: {str(e)}") from e
def format_prompt(output_format: str) -> str:
    """
    Build the format-specific instruction snippet for the flashcard prompt.

    Args:
        output_format: Requested output type; matched case-insensitively
            against ``"json"`` and ``"csv"``.

    Returns:
        The instruction text, including a short example, for that format.

    Raises:
        ValueError: If ``output_format`` is neither ``"json"`` nor ``"csv"``.
    """
    fmt = output_format.lower()
    if fmt == "json":
        return (
            "You only respond in JSON format. Follow the example below.\n"
            "EXAMPLE:\n"
            "[\n"
            '{"question": "What is AI?", "answer": "Artificial Intelligence."},\n'
            '{"question": "What is ML?", "answer": "Machine Learning."}\n'
            "]\n"
        )
    if fmt == "csv":
        return (
            "You only respond with cards in CSV format. Follow the example below.\n"
            "EXAMPLE:\n"
            '"What is AI?", "Artificial Intelligence."\n'
            '"What is ML?", "Machine Learning."\n'
        )
    # Falling through used to return None despite the ``-> str`` annotation,
    # which would surface later as the literal text "None" inside a prompt.
    raise ValueError(f"Unsupported output format: {output_format!r}")
# def extract_flashcards(text: str, output_format: str, pipeline: str) -> str:
# """
# Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
# """
# prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
# {format_prompt(output_format)}
# Extract flashcards from the user's text:
# {text}
# Do not include the prompt or any other unnecessary information in the flashcards.
# Do not include triple ticks (```) or any other code blocks in the flashcards.
# """
# # TODO:
# response = pipeline.generate_flashcards("json", prompt)
# return response
def process_file(file_obj, output_format: str, pipeline) -> str:
    """
    Read an uploaded file and generate flashcards from its text.

    Args:
        file_obj: Either an object exposing the file's path as ``.name``
            (e.g. an upload wrapper or temp-file object) or the path itself
            as ``str`` / ``os.PathLike``.
        output_format: Output type forwarded to the pipeline.
        pipeline: Object providing ``generate_flashcards(output_format, text)``.

    Returns:
        Whatever ``pipeline.generate_flashcards`` returns for the file's text.

    Raises:
        ValueError: If no file is given, the extension is not ``.pdf``,
            ``.txt`` or ``.md``, or reading the file fails.
    """
    if file_obj is None:
        raise ValueError("No file provided.")

    # Check for plain paths first: ``pathlib.Path`` also has a ``.name``
    # attribute, but it holds only the basename rather than the full path.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = os.fspath(file_obj)
    else:
        file_path = file_obj.name

    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.pdf':
        text = process_pdf(file_path)
    elif file_ext in ('.txt', '.md'):
        text = read_text_file(file_path)
    else:
        raise ValueError("Unsupported file type.")

    return pipeline.generate_flashcards(output_format, text)
def process_text_input(output_format: str, input_text: str, pipeline=None) -> str:
    """
    Generate flashcards from text supplied directly by the user.

    Args:
        output_format: Output type forwarded to the pipeline.
        input_text: Raw text to turn into flashcards.
        pipeline: Object providing ``generate_flashcards(output_format, text)``.
            Optional for backward compatibility: when omitted, a module-level
            ``pipeline`` is used if one exists.

    Returns:
        Whatever ``pipeline.generate_flashcards`` returns for the text.

    Raises:
        ValueError: If ``input_text`` is empty or whitespace-only, or if no
            pipeline is available.
    """
    if not input_text.strip():
        raise ValueError("No text provided.")

    if pipeline is None:
        # The function used to read an undefined global ``pipeline`` and
        # failed with NameError; honour such a global only if it exists.
        pipeline = globals().get("pipeline")
        if pipeline is None:
            raise ValueError("No pipeline provided.")

    return pipeline.generate_flashcards(output_format, input_text)