Spaces:
Sleeping
Sleeping
File size: 2,389 Bytes
4d17caa 82915e5 b8d2f65 82915e5 4d17caa b8d2f65 4d17caa f5b41ec 4d17caa b8d2f65 4d17caa 2f264ab b8d2f65 4d17caa b8d2f65 4d17caa b8d2f65 4d17caa b8d2f65 4d17caa b8d2f65 4d17caa b8d2f65 4d17caa 8428312 82915e5 b8d2f65 8428312 82915e5 8428312 82915e5 8428312 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import os
import pymupdf4llm
from .models import parse_message
from .pipeline import Pipeline
def process_pdf(pdf_path: str) -> str:
    """Extract the text of a PDF file as Markdown using pymupdf4llm.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        Whatever ``pymupdf4llm.to_markdown`` returns for the file.

    Raises:
        ValueError: If the conversion fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        return pymupdf4llm.to_markdown(pdf_path)
    except Exception as e:
        # Chain explicitly so the underlying failure stays visible to callers
        # and in tracebacks instead of being reduced to its message text.
        raise ValueError(f"Error processing PDF: {e}") from e
def read_text_file(file_path: str) -> str:
    """Read the full contents of a UTF-8 text file (e.g. .txt or .md).

    Args:
        file_path: Path of the file to read.

    Returns:
        The entire file contents as a single string.

    Raises:
        ValueError: If the file cannot be opened or decoded. The original
            exception is attached as ``__cause__``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        # Chain explicitly so callers can still inspect the real failure
        # (missing file, permission error, decode error, ...).
        raise ValueError(f"Error reading text file: {e}") from e
def process_file(file_obj, output_format: str, pipeline) -> str:
    """Turn an uploaded file into flashcards in the requested format.

    The file's extension (case-insensitive) selects the reader: ``.pdf``
    goes through ``process_pdf``; ``.txt`` and ``.md`` go through
    ``read_text_file``. The extracted text is then handed to
    ``generate_flashcards``.

    Args:
        file_obj: Object whose ``name`` attribute is the path of the file.
        output_format: Output format forwarded to ``generate_flashcards``.
        pipeline: Accepted for interface compatibility; not used by this
            function.

    Returns:
        The formatted flashcards.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    source_path = file_obj.name
    _, suffix = os.path.splitext(source_path)
    suffix = suffix.lower()

    # Reject unknown extensions up front so the happy path reads linearly.
    if suffix not in ('.pdf', '.txt', '.md'):
        raise ValueError("Unsupported file type.")

    if suffix == '.pdf':
        extracted = process_pdf(source_path)
    else:
        extracted = read_text_file(source_path)

    return generate_flashcards(output_format, extracted)
def reduce_newlines(text: str) -> str:
    """Collapse every run of three or more newlines down to exactly two.

    Runs of one or two newlines are left unchanged.
    """
    collapsed = text
    while True:
        # A single replace pass can leave shorter runs of 3+ behind,
        # so keep going until none remain.
        if "\n\n\n" not in collapsed:
            return collapsed
        collapsed = collapsed.replace("\n\n\n", "\n\n")
def generate_flashcards(output_format: str, content: str) -> str:
    """Extract flashcards from ``content`` and format them.

    The content is first passed through ``reduce_newlines``, then sent to
    ``extract_flashcards`` on a newly constructed ``Pipeline``; the response
    is rendered by ``format_flashcards``.

    Args:
        output_format: Output format forwarded to ``format_flashcards``.
        content: Source text to extract flashcards from.

    Returns:
        The formatted flashcards.
    """
    normalized = reduce_newlines(content)
    # A fresh Pipeline is created on every call.
    extractor = Pipeline()
    raw_response = extractor.extract_flashcards(normalized)
    return format_flashcards(output_format, raw_response)
def process_text_input(input_text: str, output_format: str = "csv") -> str:
    """Extract flashcards from raw text supplied directly by the caller.

    Args:
        input_text: Text to extract flashcards from.
        output_format: Output format forwarded to ``generate_flashcards``;
            defaults to ``"csv"``.

    Returns:
        The formatted flashcards.

    Raises:
        ValueError: If ``input_text`` is ``None``, empty, or only whitespace.
    """
    # Check for None/empty first so a missing value yields the same
    # ValueError as blank text rather than an AttributeError from .strip().
    if not input_text or not input_text.strip():
        raise ValueError("No text provided.")
    # generate_flashcards constructs its own Pipeline, so none is built here.
    return generate_flashcards(output_format, input_text)
def format_flashcards(output_format: str, response: str) -> str:
    """Render a pipeline response in the requested output format.

    The response is parsed with ``parse_message``; any exception it raises
    propagates unchanged to the caller.

    Args:
        output_format: Compared case-insensitively against ``"json"`` and
            ``"csv"``.
        response: Raw response passed to ``parse_message``.

    Returns:
        ``message.content_to_json()`` for ``"json"``,
        ``message.content_to_csv()`` for ``"csv"``, and an empty string for
        any other format.
    """
    message = parse_message(response)
    requested = output_format.lower()

    if requested == "json":
        return message.content_to_json()
    if requested == "csv":
        return message.content_to_csv()
    # Unrecognized formats yield an empty string rather than an error.
    return ""
|