import os

import pymupdf4llm

# Prompt fragments keyed by lower-cased output format name.
_FORMAT_PROMPTS = {
    "json": """You only respond in JSON format. Follow the example below.
EXAMPLE:
[
    {"question": "What is AI?", "answer": "Artificial Intelligence."},
    {"question": "What is ML?", "answer": "Machine Learning."}
]
""",
    "csv": """You only respond with cards in CSV format. Follow the example below.
EXAMPLE:
"What is AI?", "Artificial Intelligence."
"What is ML?", "Machine Learning."
""",
}

# File extensions that are read as plain UTF-8 text.
_TEXT_EXTENSIONS = (".txt", ".md")


def process_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file using pymupdf4llm.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The document content as returned by ``pymupdf4llm.to_markdown``.

    Raises:
        ValueError: If the PDF cannot be processed. The original exception
            is attached as ``__cause__``.
    """
    try:
        # pymupdf4llm exposes ``to_markdown`` as its extraction entry point;
        # it has no ``extract_text`` function.
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Broad catch kept on purpose: callers rely on ValueError for any
        # failure coming out of the third-party extractor.
        raise ValueError(f"Error processing PDF: {e}") from e


def read_text_file(file_path: str) -> str:
    """Read the full contents of a .txt or .md file as UTF-8 text.

    Args:
        file_path: Path to the text file.

    Returns:
        The file contents.

    Raises:
        ValueError: If the file cannot be opened or decoded. The original
            exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        # Broad catch kept so every read failure surfaces as ValueError,
        # matching what existing callers handle.
        raise ValueError(f"Error reading text file: {e}") from e


def format_prompt(output_format: str) -> str:
    """Return the prompt fragment describing the requested output format.

    Args:
        output_format: ``"json"`` or ``"csv"`` (case-insensitive).

    Returns:
        The instruction text, including an example, for that format.

    Raises:
        ValueError: If ``output_format`` is not a supported format.
    """
    try:
        return _FORMAT_PROMPTS[output_format.lower()]
    except KeyError:
        # Previously this fell through and returned None, which would end up
        # as the literal text "None" inside a prompt.
        raise ValueError(f"Unsupported output format: {output_format}") from None


# def extract_flashcards(text: str, output_format: str, pipeline: str) -> str:
#     """
#     Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
#     """
#     prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
#     {format_prompt(output_format)}
#     Extract flashcards from the user's text:
#     {text}
#     Do not include the prompt or any other unnecessary information in the flashcards.
#     Do not include triple ticks (```) or any other code blocks in the flashcards.
#     """
#     # TODO:
#     response = pipeline.generate_flashcards("json", prompt)
#     return response


def process_file(file_obj, output_format: str, pipeline) -> str:
    """Read an uploaded file and generate flashcards from its text.

    Args:
        file_obj: Either a path (``str`` / ``os.PathLike``) or an object
            whose ``name`` attribute holds the path of the uploaded file.
        output_format: Output format forwarded to the pipeline.
        pipeline: Object exposing
            ``generate_flashcards(output_format, text)``.

    Returns:
        Whatever ``pipeline.generate_flashcards`` returns.

    Raises:
        ValueError: If no file is given, the extension is not ``.pdf``,
            ``.txt`` or ``.md``, or the file cannot be read.
    """
    if file_obj is None:
        raise ValueError("No file provided.")

    # Accept plain paths as well as file-like upload objects. PathLike is
    # checked first because ``Path.name`` is only the basename, not the path.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = os.fspath(file_obj)
    else:
        file_path = file_obj.name

    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == ".pdf":
        text = process_pdf(file_path)
    elif file_ext in _TEXT_EXTENSIONS:
        text = read_text_file(file_path)
    else:
        raise ValueError("Unsupported file type.")

    return pipeline.generate_flashcards(output_format, text)


def process_text_input(output_format: str, input_text: str, pipeline=None) -> str:
    """Generate flashcards from text supplied directly by the user.

    Args:
        output_format: Output format forwarded to the pipeline.
        input_text: The text to extract flashcards from.
        pipeline: Object exposing
            ``generate_flashcards(output_format, text)``. When omitted, a
            module-level ``pipeline`` is used if one has been set.

    Returns:
        Whatever ``pipeline.generate_flashcards`` returns.

    Raises:
        ValueError: If ``input_text`` is empty or whitespace-only, or if no
            pipeline is available.
    """
    if not input_text or not input_text.strip():
        raise ValueError("No text provided.")

    if pipeline is None:
        # The original body referenced an undefined global ``pipeline``;
        # keep honouring one if a caller injected it into this module.
        pipeline = globals().get("pipeline")
    if pipeline is None:
        raise ValueError("No flashcard pipeline provided.")

    return pipeline.generate_flashcards(output_format, input_text)