Spaces:

nslaughter
/

flashcard-studio

Sleeping

App Files Community

Nathan Slaughter committed on Oct 6, 2024

Commit

2f264ab

1 Parent(s): 4d17caa

add pipeline method

Browse files

Files changed (7) hide show

app/interface.py +8 -8
app/models.py +0 -31
app/pipeline.py +151 -0
app/processing.py +19 -23
tests/conftest.py +2 -2
tests/{test_models.py → test_pipeline.py} +4 -6
tests/test_processing.py +13 -15

app/interface.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import gradio as gr
-from .models import LanguageModel
 from .processing import process_file, process_text_input
 def create_interface():
     # Initialize the language model
-    language_model = LanguageModel()
     # Define the Output Format Selector
     output_format_selector = gr.Radio(
@@ -18,18 +18,18 @@ def create_interface():
     flashcard_output_file = gr.Textbox(
         label="Flashcards",
         lines=20,
-        placeholder="Extracted flashcards will appear here..."
     )
     flashcard_output_text = gr.Textbox(
         label="Flashcards",
         lines=20,
-        placeholder="Extracted flashcards will appear here..."
     )
     # Define the Gradio interface function for File Upload
     def handle_file_upload(file_obj, output_format):
         try:
-            flashcards = process_file(file_obj, output_format, language_model)
             return flashcards
         except ValueError as ve:
             return str(ve)
@@ -37,16 +37,16 @@ def create_interface():
     # Define the Gradio interface function for Text Input
     def handle_text_input(input_text, output_format):
         try:
-            flashcards = process_text_input(input_text, output_format, language_model)
             return flashcards
         except ValueError as ve:
             return str(ve)
     # Create the Gradio Tabs
     with gr.Blocks() as interface:
-        gr.Markdown("# Flashcard Extraction Tool")
         gr.Markdown(
-            "Extract flashcards from uploaded files or directly input text. Choose your preferred output format."
         )
         with gr.Tab("Upload File"):
             with gr.Row():

 import gradio as gr
+from .pipeline import Pipeline
 from .processing import process_file, process_text_input
 def create_interface():
     # Initialize the language model
+    language_model = Pipeline()
     # Define the Output Format Selector
     output_format_selector = gr.Radio(
     flashcard_output_file = gr.Textbox(
         label="Flashcards",
         lines=20,
+        placeholder="Your flashcards will appear here..."
     )
     flashcard_output_text = gr.Textbox(
         label="Flashcards",
         lines=20,
+        placeholder="Your flashcards will appear here..."
     )
     # Define the Gradio interface function for File Upload
     def handle_file_upload(file_obj, output_format):
         try:
+            flashcards = process_file(file_obj, output_format, Pipeline())
             return flashcards
         except ValueError as ve:
             return str(ve)
     # Define the Gradio interface function for Text Input
     def handle_text_input(input_text, output_format):
         try:
+            flashcards = process_text_input(input_text, output_format, Pipeline())
             return flashcards
         except ValueError as ve:
             return str(ve)
     # Create the Gradio Tabs
     with gr.Blocks() as interface:
+        gr.Markdown("# Flashcard Studio")
         gr.Markdown(
+            "Make flashcards from uploaded files or directly input text. Choose your preferred output format."
         )
         with gr.Tab("Upload File"):
             with gr.Row():

app/models.py DELETED Viewed

@@ -1,31 +0,0 @@
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-class LanguageModel:
-    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
-        self.device = self._determine_device()
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype="auto",
-            device_map="auto"
-        )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-    def _determine_device(self):
-        if torch.cuda.is_available():
-            return torch.device("cuda")
-        elif torch.backends.mps.is_available():
-            return torch.device("mps")
-        else:
-            return torch.device("cpu")
-    def generate_flashcards(self, prompt: str, max_new_tokens: int = 1024) -> str:
-        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
-        with torch.no_grad():
-            output_ids = self.model.generate(
-                inputs.input_ids,
-                max_new_tokens=max_new_tokens,
-                do_sample=True
-            )
-        response = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
-        return response

app/pipeline.py ADDED Viewed

	@@ -0,0 +1,151 @@

+from io import StringIO
+import csv
+import json
+import logging
+import torch
+from transformers import pipeline
+from pydantic import BaseModel, ValidationError, validator
+logger = logging.getLogger(__name__)
+class Card(BaseModel):
+    question: str
+    answer: str
+class Message(BaseModel):
+    role: str
+    content: list[Card]
+    @validator('content', pre=True)
+    def parse_content(cls, v):
+        if isinstance(v, str):
+            try:
+                content_list = json.loads(v)
+                return content_list
+            except json.JSONDecodeError as e:
+                raise ValueError(f"Error decoding 'content' JSON: {e}") from e
+        return v
+    def content_to_json(self) -> str:
+        return json.dumps([card.dict() for card in self.content], indent=2)
+    def content_to_csv(self) -> str:
+        output = StringIO()
+        writer = csv.writer(output)
+        writer.writerow(['Question', 'Answer'])  # CSV Header
+        for card in self.content:
+            writer.writerow([card.question, card.answer])
+        return output.getvalue()
+class PydanticEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, BaseModel):
+            return obj.dict()
+        return super().default(obj)
+class Pipeline:
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct"):
+        self.torch_pipe = pipeline(
+            "text-generation",
+            "Qwen/Qwen2.5-7B-Instruct",
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        self.device = self._determine_device()
+        self.messages = [
+            {"role": "system", "content": """You are an expert flashcard creator. You always include a single knowledge item per flashcard.
+            - You ALWAYS include a single knowledge item per flashcard.
+            - You ALWAYS respond in valid JSON format.
+            Format responses like the example below.
+            EXAMPLE:
+            [
+                {"question": "What is AI?", "answer": "Artificial Intelligence."},
+                {"question": "What is ML?", "answer": "Machine Learning."}
+            ]
+            """},
+        ]
+    def extract_flashcards(self, content: str = "", max_new_tokens: int = 1024) -> str:
+        user_prompt = {"role": "user", "content": content}
+        self.messages.append(user_prompt)
+        response_message = self.torch_pipe(
+            self.messages,
+            max_new_tokens=max_new_tokens
+        )[0]["generated_text"][-1]
+        return response_message
+    def format_flashcards(self, output_format: str, response: str) -> str:
+        output = ""
+        try :
+            message = parse_message(response)
+            logger.debug("after parse_obj_as")
+        except ValidationError as e:
+            raise e
+        if output_format.lower() == "json":
+            output = message.content_to_json()
+        elif output_format.lower() == "csv":
+            output = message.content_to_csv()
+        return output
+    def generate_flashcards(self, output_format: str, content: str) -> str:
+        response = self.extract_flashcards(content)
+        return self.format_flashcards(output_format, response)
+    def parse_message(self, input_dict: dict[str, any]) -> Message:
+        try:
+            # Extract the role
+            role = input_dict['role']
+            # Parse the content
+            content = input_dict['content']
+            # If content is a string, try to parse it as JSON
+            if isinstance(content, str):
+                content = content.strip()
+                content = json.loads(content)
+            # Create Card objects from the content
+            cards = [Card(**item) for item in content]
+            # Create and return the Message object
+            return Message(role=role, content=cards)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in content: {str(e)}")
+        except ValidationError as e:
+            raise ValueError(f"Validation error: {str(e)}")
+        except KeyError as e:
+            raise ValueError(f"Missing required key: {str(e)}")
+    def _determine_device(self):
+        if torch.cuda.is_available():
+            return torch.device("cuda")
+        elif torch.backends.mps.is_available():
+            return torch.device("mps")
+        else:
+            return torch.device("cpu")
+def parse_message(input_dict: dict[str, any]) -> Message:
+    try:
+        # Extract the role
+        role: str = input_dict['role']
+        # Parse the content
+        content: str = input_dict['content']
+        # If content is a string, try to parse it as JSON
+        if isinstance(content, str):
+            content = json.loads(content)
+        # Create Card objects from the content
+        cards = [Card(**item) for item in content]
+        # Create and return the Message object
+        return Message(role=role, content=cards)
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON in content: {str(e)}")
+    except ValidationError as e:
+        raise ValueError(f"Validation error: {str(e)}")
+    except KeyError as e:
+        raise ValueError(f"Missing required key: {str(e)}")

app/processing.py CHANGED Viewed

@@ -27,13 +27,12 @@ def format_prompt(output_format: str) -> str:
     Formats the prompt based on the output type.
     """
     if output_format.lower() == "json":
-        return """You only respond with cards in JSON format. Follow the example below.
     EXAMPLE:
     [
         {"question": "What is AI?", "answer": "Artificial Intelligence."},
         {"question": "What is ML?", "answer": "Machine Learning."}
-        ...
     ]
     """
     elif output_format.lower() == "csv":
@@ -42,32 +41,29 @@ def format_prompt(output_format: str) -> str:
     EXAMPLE:
         "What is AI?", "Artificial Intelligence."
         "What is ML?", "Machine Learning."
-        ...
     """
-def extract_flashcards(text: str, output_format: str, language_model: str) -> str:
-    """
-    Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
-    """
-    prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
-    {format_prompt(output_format)}
-    Extract flashcards from the user's text:
-    {text}
-    Do not include the prompt or any other unnecessary information in the flashcards.
-    Do not include triple ticks (```) or any other code blocks in the flashcards.
-    """
-    # TODO:
-    # see https://qwen.readthedocs.io/en/latest/inference/chat.html
-    # e.g. pipeline = pipeline("text-generation", model="Qwen/Qwen2.5-7B-Instruct")
-    response = language_model.generate_flashcards(prompt)
-    return response
-def process_file(file_obj, output_format: str, language_model) -> str:
     """
     Processes the uploaded file based on its type and extracts flashcards.
     """
@@ -81,15 +77,15 @@ def process_file(file_obj, output_format: str, language_model) -> str:
     else:
         raise ValueError("Unsupported file type.")
-    flashcards = extract_flashcards(text, output_format, language_model)
     return flashcards
-def process_text_input(input_text: str, output_format: str, language_model) -> str:
     """
     Processes the input text and extracts flashcards.
     """
     if not input_text.strip():
         raise ValueError("No text provided.")
-    flashcards = extract_flashcards(input_text, output_format, language_model)
     return flashcards

     Formats the prompt based on the output type.
     """
     if output_format.lower() == "json":
+        return """You only respond in JSON format. Follow the example below.
     EXAMPLE:
     [
         {"question": "What is AI?", "answer": "Artificial Intelligence."},
         {"question": "What is ML?", "answer": "Machine Learning."}
     ]
     """
     elif output_format.lower() == "csv":
     EXAMPLE:
         "What is AI?", "Artificial Intelligence."
         "What is ML?", "Machine Learning."
     """
+# def extract_flashcards(text: str, output_format: str, pipeline: str) -> str:
+#     """
+#     Extracts flashcards from the input text using the LLM and formats them in CSV or JSON.
+#     """
+#     prompt = f"""You are an expert flashcard creator. You always include a single knowledge item per flashcard.
+#     {format_prompt(output_format)}
+#     Extract flashcards from the user's text:
+#     {text}
+#     Do not include the prompt or any other unnecessary information in the flashcards.
+#     Do not include triple ticks (```) or any other code blocks in the flashcards.
+#     """
+#     # TODO:
+#     response = pipeline.generate_flashcards("json", prompt)
+#     return response
+def process_file(file_obj, output_format: str, pipeline) -> str:
     """
     Processes the uploaded file based on its type and extracts flashcards.
     """
     else:
         raise ValueError("Unsupported file type.")
+    flashcards = pipeline.generate_flashcards(output_format, text)
     return flashcards
+def process_text_input(output_format: str, input_text: str) -> str:
     """
     Processes the input text and extracts flashcards.
     """
     if not input_text.strip():
         raise ValueError("No text provided.")
+    flashcards = pipeline.generate_flashcards(output_format, input_text)
     return flashcards

tests/conftest.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import pytest
 from unittest.mock import Mock
-from app.models import LanguageModel
 @pytest.fixture
-def language_model():
     """
     Fixture to provide a mocked LanguageModel instance.
     """

 import pytest
 from unittest.mock import Mock
+from app.pipeline import LanguageModel
 @pytest.fixture
+def pipeline():
     """
     Fixture to provide a mocked LanguageModel instance.
     """

tests/{test_models.py → test_pipeline.py} RENAMED Viewed

@@ -1,8 +1,6 @@
-# tests/test_models.py
 import pytest
-def test_generate_flashcards(language_model, mocker):
     """
     Test the generate_flashcards method of LanguageModel.
     """
@@ -10,11 +8,11 @@ def test_generate_flashcards(language_model, mocker):
     expected_response = '{"flashcards": [{"Question": "What is AI?", "Answer": "Artificial Intelligence."}]}'
     # Configure the mock to return a specific response
-    language_model.generate_flashcards.return_value = expected_response
     # Call the method
-    response = language_model.generate_flashcards(prompt)
     # Assertions
     assert response == expected_response
-    language_model.generate_flashcards.assert_called_once_with(prompt)

 import pytest
+def test_generate_flashcards(pipeline, mocker):
     """
     Test the generate_flashcards method of LanguageModel.
     """
     expected_response = '{"flashcards": [{"Question": "What is AI?", "Answer": "Artificial Intelligence."}]}'
     # Configure the mock to return a specific response
+    pipeline.generate_flashcards.return_value = expected_response
     # Call the method
+    response = pipeline.generate_flashcards(prompt)
     # Assertions
     assert response == expected_response
+    pipeline.generate_flashcards.assert_called_once_with(prompt)

tests/test_processing.py CHANGED Viewed

@@ -1,9 +1,7 @@
-# tests/test_processing.py
 import pytest
 from app.processing import process_text_input, process_file
-def test_process_text_input_success(language_model):
     """
     Test processing of valid text input.
     """
@@ -11,11 +9,11 @@ def test_process_text_input_success(language_model):
     output_format = "JSON"
     expected_output = '{"flashcards": []}'
-    result = process_text_input(input_text, output_format, language_model)
     assert result == expected_output
-    language_model.generate_flashcards.assert_called_once()
-def test_process_text_input_empty(language_model):
     """
     Test processing of empty text input.
     """
@@ -23,10 +21,10 @@ def test_process_text_input_empty(language_model):
     output_format = "JSON"
     with pytest.raises(ValueError) as excinfo:
-        process_text_input(input_text, output_format, language_model)
     assert "No text provided." in str(excinfo.value)
-def test_process_file_unsupported_type(language_model, tmp_path):
     """
     Test processing of an unsupported file type.
     """
@@ -35,10 +33,10 @@ def test_process_file_unsupported_type(language_model, tmp_path):
     dummy_file.write_text("Unsupported content")
     with pytest.raises(ValueError) as excinfo:
-        process_file(dummy_file, "JSON", language_model)
     assert "Unsupported file type." in str(excinfo.value)
-def test_process_file_pdf(language_model, tmp_path, mocker):
     """
     Test processing of a PDF file.
     """
@@ -51,11 +49,11 @@ def test_process_file_pdf(language_model, tmp_path, mocker):
     expected_output = '{"flashcards": []}'
-    result = process_file(dummy_file, "JSON", language_model)
     assert result == expected_output
-    language_model.generate_flashcards.assert_called_once()
-def test_process_file_txt(language_model, tmp_path, mocker):
     """
     Test processing of a TXT file.
     """
@@ -68,6 +66,6 @@ def test_process_file_txt(language_model, tmp_path, mocker):
     expected_output = '{"flashcards": []}'
-    result = process_file(dummy_file, "JSON", language_model)
     assert result == expected_output
-    language_model.generate_flashcards.assert_called_once()

 import pytest
 from app.processing import process_text_input, process_file
+def test_process_text_input_success(pipeline):
     """
     Test processing of valid text input.
     """
     output_format = "JSON"
     expected_output = '{"flashcards": []}'
+    result = process_text_input(input_text, output_format, pipeline)
     assert result == expected_output
+    pipeline.generate_flashcards.assert_called_once()
+def test_process_text_input_empty(pipeline):
     """
     Test processing of empty text input.
     """
     output_format = "JSON"
     with pytest.raises(ValueError) as excinfo:
+        process_text_input(input_text, output_format, pipeline)
     assert "No text provided." in str(excinfo.value)
+def test_process_file_unsupported_type(pipeline, tmp_path):
     """
     Test processing of an unsupported file type.
     """
     dummy_file.write_text("Unsupported content")
     with pytest.raises(ValueError) as excinfo:
+        process_file(dummy_file, "JSON", pipeline)
     assert "Unsupported file type." in str(excinfo.value)
+def test_process_file_pdf(pipeline, tmp_path, mocker):
     """
     Test processing of a PDF file.
     """
     expected_output = '{"flashcards": []}'
+    result = process_file(dummy_file, "JSON", pipeline)
     assert result == expected_output
+    pipeline.generate_flashcards.assert_called_once()
+def test_process_file_txt(pipeline, tmp_path, mocker):
     """
     Test processing of a TXT file.
     """
     expected_output = '{"flashcards": []}'
+    result = process_file(dummy_file, "JSON", pipeline)
     assert result == expected_output
+    pipeline.generate_flashcards.assert_called_once()