Spaces:

nslaughter
/

flashcard-studio

Sleeping

App Files Files Community

Nathan Slaughter committed on Oct 6, 2024

Commit

82915e5

1 Parent(s): 8428312

move parse message

Browse files

Files changed (4) hide show

app/models.py +25 -1
app/pipeline.py +12 -35
app/processing.py +5 -7
tests/test_pipeline.py +2 -18

app/models.py CHANGED Viewed

@@ -13,7 +13,7 @@ class Message(BaseModel):
     content: list[Card]
     @validator('content', pre=True)
-    def parse_content(cls, v):
         if isinstance(v, str):
             try:
                 content_list = json.loads(v)
@@ -44,3 +44,27 @@ class PydanticEncoder(json.JSONEncoder):
         if isinstance(obj, BaseModel):
             return obj.dict()
         return super().default(obj)

     content: list[Card]
     @validator('content', pre=True)
+    def parse_content(cls, v: str) -> 'Message':
         if isinstance(v, str):
             try:
                 content_list = json.loads(v)
         if isinstance(obj, BaseModel):
             return obj.dict()
         return super().default(obj)
+def parse_message(input_dict: dict[str, any]) -> Message:
+    try:
+        # Extract the role
+        role: str = input_dict['role']
+        # Parse the content
+        content: str = input_dict['content']
+        # If content is a string, try to parse it as JSON
+        if isinstance(content, str):
+            content = json.loads(content)
+        # Create Card objects from the content
+        cards = [Card(**item) for item in content]
+        # Create and return the Message object
+        return Message(role=role, content=cards)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON in content: {str(e)}")
+    except ValidationError as e:
+        raise ValueError(f"Validation error: {str(e)}")
+    except KeyError as e:
+        raise ValueError(f"Missing required key: {str(e)}")

app/pipeline.py CHANGED Viewed

@@ -5,7 +5,7 @@ import logging
 import torch
 from transformers import pipeline
-from .models import Card, Message, ValidationError
 logger = logging.getLogger(__name__)
 logging.basicConfig(filename="pipeline.log", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
@@ -21,7 +21,7 @@ class Pipeline:
         self.device = self._determine_device()
         logger.info(f"device type: {self.device}")
         self.messages = [
-            {"role": "system", "content": """You are an expert flashcard creator. You always include a single knowledge item per flashcard.
             - You ALWAYS include a single knowledge item per flashcard.
             - You ALWAYS respond in valid JSON format.
@@ -38,47 +38,24 @@ class Pipeline:
     def extract_flashcards(self, content: str = "", max_new_tokens: int = 1024) -> str:
         user_prompt = {"role": "user", "content": content}
         self.messages.append(user_prompt)
-        response_message = self.torch_pipe(
-            self.messages,
-            max_new_tokens=max_new_tokens
-        )[0]["generated_text"][-1]
-        return response_message
     def generate_flashcards(self, output_format: str, content: str) -> str:
         response = self.extract_flashcards(content)
         return format_flashcards(output_format, response)
-    def _determine_device(self):
         if torch.cuda.is_available():
             return torch.device("cuda")
         elif torch.backends.mps.is_available():
             return torch.device("mps")
         else:
             return torch.device("cpu")
-def parse_message(input_dict: dict[str, any]) -> Message:
-    try:
-        # Extract the role
-        role: str = input_dict['role']
-        # Parse the content
-        content: str = input_dict['content']
-        # If content is a string, try to parse it as JSON
-        if isinstance(content, str):
-            content = json.loads(content)
-        # Create Card objects from the content
-        cards = [Card(**item) for item in content]
-        # Create and return the Message object
-        return Message(role=role, content=cards)
-    except json.JSONDecodeError as e:
-        logger.error(f"Invalid JSON in content: {str(e)}")
-        raise ValueError(f"Invalid JSON in content: {str(e)}")
-    except ValidationError as e:
-        logger.error(f"Validation error: {str(e)}")
-        raise ValueError(f"Validation error: {str(e)}")
-    except KeyError as e:
-        logger.error(f"Missing required key: {str(e)}")
-        raise ValueError(f"Missing required key: {str(e)}")

 import torch
 from transformers import pipeline
+from .processing import format_flashcards
 logger = logging.getLogger(__name__)
 logging.basicConfig(filename="pipeline.log", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
         self.device = self._determine_device()
         logger.info(f"device type: {self.device}")
         self.messages = [
+            {"role": "system", "content": """You are an expert flashcard creator.
             - You ALWAYS include a single knowledge item per flashcard.
             - You ALWAYS respond in valid JSON format.
     def extract_flashcards(self, content: str = "", max_new_tokens: int = 1024) -> str:
         user_prompt = {"role": "user", "content": content}
         self.messages.append(user_prompt)
+        try:
+            response_message = self.torch_pipe(
+                self.messages,
+                max_new_tokens=max_new_tokens
+            )[0]["generated_text"][-1]
+            return response_message
+        except Exception as e:
+            logger.error(f"Error extracting flashcards: {str(e)}")
+            raise ValueError(f"Error extraction flashcards: {str(e)}")
     def generate_flashcards(self, output_format: str, content: str) -> str:
         response = self.extract_flashcards(content)
         return format_flashcards(output_format, response)
+    def _determine_device(self) -> torch.device:
         if torch.cuda.is_available():
             return torch.device("cuda")
         elif torch.backends.mps.is_available():
             return torch.device("mps")
         else:
             return torch.device("cpu")

app/processing.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import os
 import pymupdf4llm
 def process_pdf(pdf_path: str) -> str:
     """
     Extracts text from a PDF file using pymupdf4llm.
@@ -28,14 +30,12 @@ def process_file(file_obj, output_format: str, pipeline) -> str:
     """
     file_path = file_obj.name
     file_ext = os.path.splitext(file_path)[1].lower()
     if file_ext == '.pdf':
         text = process_pdf(file_path)
     elif file_ext in ['.txt', '.md']:
         text = read_text_file(file_path)
     else:
         raise ValueError("Unsupported file type.")
     flashcards = pipeline.generate_flashcards(output_format, text)
     return flashcards
@@ -49,16 +49,14 @@ def process_text_input(output_format: str, input_text: str) -> str:
     flashcards = pipeline.generate_flashcards(output_format, input_text)
     return flashcards
-def format_flashcards(self, output_format: str, response: str) -> str:
     output = ""
     try :
         message = parse_message(response)
-        logger.debug("after parse_obj_as")
-    except ValidationError as e:
         raise e
     if output_format.lower() == "json":
-        output = message.content_to_json()
     elif output_format.lower() == "csv":
         output = message.content_to_csv()
     return output

 import os
 import pymupdf4llm
+from .models import parse_message
 def process_pdf(pdf_path: str) -> str:
     """
     Extracts text from a PDF file using pymupdf4llm.
     """
     file_path = file_obj.name
     file_ext = os.path.splitext(file_path)[1].lower()
     if file_ext == '.pdf':
         text = process_pdf(file_path)
     elif file_ext in ['.txt', '.md']:
         text = read_text_file(file_path)
     else:
         raise ValueError("Unsupported file type.")
     flashcards = pipeline.generate_flashcards(output_format, text)
     return flashcards
     flashcards = pipeline.generate_flashcards(output_format, input_text)
     return flashcards
+def format_flashcards(output_format: str, response: str) -> str:
     output = ""
     try :
         message = parse_message(response)
+    except Exception as e:
         raise e
     if output_format.lower() == "json":
+        output:str = message.content_to_json()
     elif output_format.lower() == "csv":
         output = message.content_to_csv()
     return output

tests/test_pipeline.py CHANGED Viewed

@@ -3,8 +3,8 @@ from unittest.mock import Mock, patch
 import json
 from io import StringIO
 from pydantic import ValidationError
-from app.pipeline import Pipeline, Message, Card, parse_message
-from app.models import PydanticEncoder
 # Tests for Pipeline class
 @pytest.fixture
@@ -13,22 +13,6 @@ def mock_pipeline():
         mock_pipe.return_value = Mock()
         yield Pipeline("mock_model")
-# def test_extract_flashcards(mock_pipeline):
-#     mock_pipeline.torch_pipe.return_value = [{"generated_text": [{"role": "assistant", "content": '[{"question": "Q", "answer": "A"}]'}]}]
-#     response = mock_pipeline.extract_flashcards("Test content")
-#     assert isinstance(response, dict)
-#     assert "content" in response
-# def test_format_flashcards_csv(mock_pipeline):
-#     response = {"role": "assistant", "content": '[{"question": "Q", "answer": "A"}]'}
-#     formatted = mock_pipeline.format_flashcards("csv", response)
-#     assert formatted.strip() == "Question,Answer\nQ,A"
-# def test_generate_flashcards(mock_pipeline):
-#     mock_pipeline.extract_flashcards.return_value = {"role": "assistant", "content": '[{"question": "Q", "answer": "A"}]'}
-#     result = mock_pipeline.generate_flashcards("json", "Test content")
-#     assert json.loads(result) == [{"question": "Q", "answer": "A"}]
 # Tests for parse_message function
 def test_parse_message_valid_input():
     input_dict = {

 import json
 from io import StringIO
 from pydantic import ValidationError
+from app.pipeline import Pipeline
+from app.models import PydanticEncoder, Message, Card, parse_message
 # Tests for Pipeline class
 @pytest.fixture
         mock_pipe.return_value = Mock()
         yield Pipeline("mock_model")
 # Tests for parse_message function
 def test_parse_message_valid_input():
     input_dict = {