brainsqueeze committed on
Commit
f86d7f2
·
verified ·
1 Parent(s): cc80c3d

Smarter document context retrieval

Browse files

* Retrieved documents re-ranking w/ SPLADE-v3
* Enable news as a default source

app.py CHANGED
@@ -113,7 +113,8 @@ def build_rag_chat() -> Tuple[LoggedComponents, gr.Blocks]:
113
  with gr.Accordion(label="Advanced settings", open=False):
114
  es_indices = gr.CheckboxGroup(
115
  choices=list(ALL_INDICES),
116
- value=[idx for idx in ALL_INDICES if "news" not in idx],
 
117
  label="Sources to include",
118
  interactive=True,
119
  )
 
113
  with gr.Accordion(label="Advanced settings", open=False):
114
  es_indices = gr.CheckboxGroup(
115
  choices=list(ALL_INDICES),
116
+ # value=[idx for idx in ALL_INDICES if "news" not in idx],
117
+ value=list(ALL_INDICES),
118
  label="Sources to include",
119
  interactive=True,
120
  )
ask_candid/agents/elastic.py CHANGED
@@ -2,6 +2,9 @@ from typing import TypedDict, List
2
  from functools import partial
3
  import json
4
  import ast
 
 
 
5
  from pydantic import BaseModel, Field
6
 
7
  from langchain_core.runnables import RunnableSequence
@@ -24,10 +27,118 @@ from ask_candid.tools.elastic.index_search_tool import create_search_tool
24
  tools = [
25
  IndexShowDataTool(),
26
  IndexDetailsTool(),
27
- create_search_tool(),
28
  ]
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  class GraphState(TypedDict):
32
  query: str = Field(
33
  ..., description="The user's query to be processed by the system."
@@ -47,6 +158,7 @@ class GraphState(TypedDict):
47
  ...,
48
  description="The Elasticsearch query result generated or used by the agent.",
49
  )
 
50
 
51
 
52
  class AnalysisResult(BaseModel):
@@ -334,8 +446,6 @@ def build_compute_graph(llm: LLM) -> StateGraph:
334
 
335
 
336
  class ElasticGraph(StateGraph):
337
- """Elastic Seach Agent State Graph"""
338
-
339
  llm: LLM
340
  tools: List[Tool]
341
 
@@ -345,6 +455,41 @@ class ElasticGraph(StateGraph):
345
  self.tools = tools
346
  self.construct_graph()
347
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  def agent_factory(self) -> AgentExecutor:
349
  """
350
  Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
@@ -387,7 +532,7 @@ class ElasticGraph(StateGraph):
387
  return_intermediate_steps=True,
388
  )
389
 
390
- def agent_factory_claude(self) -> AgentExecutor:
391
  """
392
  Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
393
 
@@ -400,15 +545,6 @@ class ElasticGraph(StateGraph):
400
  AgentExecutor: Configured agent ready to execute tasks with specified tools,
401
  providing detailed intermediate steps for transparency.
402
  """
403
- prefix = """
404
- You are an intelligent agent tasked with generating accurate Elasticsearch DSL queries.
405
- Analyze the intent behind the query and determine the appropriate Elasticsearch operations required.
406
- Guidelines for generating right elastic seach query:
407
- 1. Automatically determine whether to return document hits or aggregation results based on the query structure.
408
- 2. Use keyword fields instead of text fields for aggregations and sorting to avoid fielddata errors
409
- 3. Avoid using field.keyword if a keyword field is already present to prevent redundant queries.
410
- 4. Ensure efficient query execution by selecting appropriate query types for filtering, searching, and aggregating.
411
- """
412
  prompt = ChatPromptTemplate.from_messages(
413
  [
414
  ("system", f"You are a helpful elasticsearch assistant. {prefix}"),
@@ -418,9 +554,19 @@ class ElasticGraph(StateGraph):
418
  ]
419
  )
420
 
421
- agent = create_tool_calling_agent(self.llm, self.tools, prompt)
 
 
 
 
 
 
 
422
  agent_executor = AgentExecutor.from_agent_and_tools(
423
- agent=agent, tools=self.tools, verbose=True, return_intermediate_steps=True
 
 
 
424
  )
425
  # Create the agent
426
  return agent_executor
@@ -467,6 +613,8 @@ class ElasticGraph(StateGraph):
467
 
468
  def grant_index_agent(self, state: GraphState) -> GraphState:
469
  print("> Grant Index Agent")
 
 
470
  input_data = {
471
  "input": f"""
472
  You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
@@ -479,52 +627,51 @@ class ElasticGraph(StateGraph):
479
  Users may not always provide the exact name, so the Elasticsearch query should accommodate partial or incomplete names
480
  by searching for relevant keywords.
481
  6. Present the response in a clear and natural language format, addressing the user's question directly.
482
-
483
-
484
  Description of some of the fields in the index but rest of the fields which are not here should be easy to understand:
485
- fiscal_year: Year when grantmaker allocates budget for funding and grants. format YYYY
486
- text: Objectives,mission, program and funding related information
487
- Program_area: program area where organization is working on
488
- Title: the title of the funding
489
- pcs_v3: PCS is taxonomy, describing the work of grantmakers, recipient organizations and the philanthropic transactions between those entities.
490
- The facets of the PCS illuminate the work and answer the following questions about philanthropy:
491
- Who? = Population Served
492
- What? = Subject and Organization Type
493
- How? = Support Strategy and Transaction Type
494
- the Facets:
495
- Subjects: Describes WHAT is being supported. Example: Elementary education or Clean water supply.
496
- Populations: Describes WHO is being supported. Example: Girls or People with disabilities.
497
- Organization Type: Describes WHAT type of organization is providing or receiving support.
498
- Transaction Type: Describes HOW support is being provided.
499
- Support Strategies: Describes HOW activities are being implemented.
500
-
501
- pcs_v3 itself is in a json format:
502
- key - subject
503
- value: it is a list of dictionary so might need to loop around to find the particular aspect
504
- hierarchy: (it is a list having subject name)
505
- [
506
- {{
507
- 'name':
508
- }},
509
- {{
510
- 'name':
511
- }}
512
- ]
513
- Before Writing elastic search query think through which field to use
514
-
515
- Note: first you should focus on query `text` then look into pcs_v3. Make sure you pick the right size for the query
516
 
 
517
  User's query:
518
  ```{state["query"]}```
519
  """
520
  }
521
- agent_exec = self.agent_factory_claude()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
  res = agent_exec.invoke(input_data)
523
  state["agent_out"] = res["output"]
524
-
525
  es_queries, es_results = {}, {}
526
  for i, action in enumerate(res.get("intermediate_steps", []), start=1):
527
  if action[0].tool == "elastic_index_search_tool":
 
528
  es_queries[f"query_{i}"] = json.loads(
529
  action[0].tool_input.get("query") or "{}"
530
  )
@@ -550,6 +697,18 @@ class ElasticGraph(StateGraph):
550
  """
551
 
552
  print("> Org Index Agent")
 
 
 
 
 
 
 
 
 
 
 
 
553
  input_data = {
554
  "input": f"""
555
  You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
@@ -557,14 +716,45 @@ class ElasticGraph(StateGraph):
557
  1. Understand the user query to determine the required information.
558
  2. Query the indices in the Elasticsearch database.
559
  3. Retrieve the mappings and field names relevant to the query.
560
- 4. Use the `organization_qa_2` index to extract the necessary data.
561
  5. Present the response in a clear and natural language format, addressing the user's question directly.
 
 
 
 
 
562
 
563
- User's quer:
564
  ```{state["query"]}```
565
  """
566
  }
567
- agent_exec = self.agent_factory_claude()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  res = agent_exec.invoke(input_data)
569
  state["agent_out"] = res["output"]
570
 
@@ -622,13 +812,15 @@ class ElasticGraph(StateGraph):
622
  """
623
 
624
  # Add nodes
 
625
  self.add_node("analyse", self.analyse_query)
626
  self.add_node("grant-index", self.grant_index_agent)
627
  self.add_node("org-index", self.org_index_agent)
628
  self.add_node("final_answer", self.final_answer)
629
 
630
  # Set entry point
631
- self.set_entry_point("analyse")
 
632
 
633
  # Add conditional edges
634
  self.add_conditional_edges(
 
2
  from functools import partial
3
  import json
4
  import ast
5
+ from ask_candid.base.api_base import BaseAPI
6
+ import os
7
+ import pandas as pd
8
  from pydantic import BaseModel, Field
9
 
10
  from langchain_core.runnables import RunnableSequence
 
27
  tools = [
28
  IndexShowDataTool(),
29
  IndexDetailsTool(),
30
+ create_search_tool(pcs_codes={}),
31
  ]
32
 
33
 
34
class AutocodingAPI(BaseAPI):
    """Thin HTTP client for the autocoding taxonomy service.

    Connection details come from the environment:
    ``AUTOCODING_API_URL`` (endpoint) and ``AUTOCODING_API_KEY`` (auth header).
    """

    def __init__(self):
        # Build headers first, then hand everything to the shared base client.
        request_headers = {
            "x-api-key": os.getenv("AUTOCODING_API_KEY"),
            "Content-Type": "application/json",
        }
        super().__init__(url=os.getenv("AUTOCODING_API_URL"), headers=request_headers)

    def __call__(self, text: str, taxonomy: str = "pcs-v3"):
        """Issue a GET request coding *text* against *taxonomy* (default ``pcs-v3``)."""
        return self.get(text=text, taxonomy=taxonomy)
47
+
48
+
49
def find_subject_levels(filtered_df, subject_level_i, target_value):
    """Collect parent subject-level labels for a target value in a taxonomy table.

    Starting from the first row where the column ``Subject Level {subject_level_i}``
    equals *target_value*, walk upward through the lower-numbered level columns,
    taking the nearest non-NaN entry above as each level's label.

    Parameters
    ----------
    filtered_df : pd.DataFrame
        Taxonomy rows with columns "Subject Level 1" .. "Subject Level 4".
        NOTE(review): the index is assumed to be contiguous integers starting
        at 0 (``.loc`` is used with positional arithmetic) — confirm callers
        reset the index after filtering.
    subject_level_i : int
        Level to anchor on; must be between 1 and 4.
    target_value : str
        Value searched for (compared against stripped string form of the column).

    Returns
    -------
    tuple[dict, pd.DataFrame]
        Mapping of "Subject Level i" -> label for each resolved level, and the
        row window from the first resolved ancestor down to the target row
        (empty DataFrame when the target is absent or the window is empty).

    Raises
    ------
    ValueError
        If *subject_level_i* is outside 1..4.
    """
    if not 1 <= subject_level_i <= 4:
        raise ValueError("subject_level_i should be between 1 and 4")

    anchor_col = f"Subject Level {subject_level_i}"
    # Compare on stripped string form so whitespace differences don't matter.
    normalized = filtered_df[anchor_col].astype(str).str.strip()
    matches = filtered_df[normalized == target_value].index
    if matches.empty:
        # Target value not present at this level — nothing to resolve.
        return {}, pd.DataFrame()

    anchor = matches[0]
    collected = {anchor_col: target_value}
    cursor = anchor

    # Walk levels i-1 .. 1, each time scanning upward past NaNs to find the
    # last row carrying a label for that level.
    for level in range(subject_level_i - 1, 0, -1):
        column = f"Subject Level {level}"
        probe = cursor - 1
        while probe >= 0 and pd.isna(filtered_df.loc[probe, column]):
            probe -= 1
        # One past the labelled row marks where this level's block begins.
        cursor = probe + 1
        if cursor in filtered_df.index:
            collected[column] = filtered_df.loc[cursor - 1, column]

    # Slice from the outermost resolved ancestor through the target row.
    if cursor < anchor:
        return collected, filtered_df.loc[cursor:anchor]
    return collected, pd.DataFrame()
115
+
116
+
117
def extract_heirarchy(full_code, target_value, taxonomy_path=None):
    """Build a " : "-joined subject hierarchy string for a PCS subject.

    Parameters
    ----------
    full_code : str
        PCS code; only its first two characters are used to pre-filter rows
        to the same top-level family.
    target_value : str
        Subject description to locate in the "Subject Level 1..4" columns.
    taxonomy_path : str, optional
        Path to the PCS taxonomy definitions spreadsheet. Defaults to the
        ``PCS_TAXONOMY_PATH`` environment variable, falling back to the
        previous hard-coded location for backward compatibility.

    Returns
    -------
    str
        Levels joined as "Level 1 : Level 2 : ...". Empty string when the
        target value cannot be resolved.
    """
    if taxonomy_path is None:
        # NOTE(review): the original hard-coded a developer-specific absolute
        # path; keep it only as a last-resort default and prefer configuring
        # PCS_TAXONOMY_PATH in the environment.
        taxonomy_path = os.getenv(
            "PCS_TAXONOMY_PATH",
            r"C:\Users\siqi.deng\Downloads\PCS_Taxonomy_Definitions_2024.xlsx",
        )
    df = pd.read_excel(taxonomy_path)

    # Restrict to rows in the same top-level PCS family as full_code.
    filtered_df = df[df["PCS Code"].str.startswith(full_code[:2], na=False)]

    # Find the shallowest subject level containing the target description.
    # If no column matches, `level` falls through as 4 (preserving the
    # original behavior) and find_subject_levels returns an empty mapping.
    for level in range(1, 5):
        if (df[f"Subject Level {level}"].str.strip() == target_value).any():
            break

    subject_level_values, filtered_df = find_subject_levels(
        filtered_df, level, target_value
    )

    # Order collected labels from Level 1 upward before joining.
    ordered = [
        value
        for _, value in sorted(
            subject_level_values.items(), key=lambda kv: int(kv[0].split()[-1])
        )
    ]
    return " : ".join(ordered)
140
+
141
+
142
  class GraphState(TypedDict):
143
  query: str = Field(
144
  ..., description="The user's query to be processed by the system."
 
158
  ...,
159
  description="The Elasticsearch query result generated or used by the agent.",
160
  )
161
+ pcs_codes: dict = Field(..., description="pcs codes")
162
 
163
 
164
  class AnalysisResult(BaseModel):
 
446
 
447
 
448
  class ElasticGraph(StateGraph):
 
 
449
  llm: LLM
450
  tools: List[Tool]
451
 
 
455
  self.tools = tools
456
  self.construct_graph()
457
 
458
+ def Extract_PCS_Codes(self, state):
459
+ """Todo: Add Subject heirarchies, Population, Geo"""
460
+ print("query", state["query"])
461
+ autocoding_api = AutocodingAPI()
462
+ autocoding_response = autocoding_api(text=state["query"]).get("data", {})
463
+ # population_served = autocoding_response.get("population", {})
464
+ subjects = autocoding_response.get("subject", {})
465
+ descriptions = []
466
+ heirarchy_string = []
467
+ if subjects and isinstance(subjects, list) and "description" in subjects[0]:
468
+ for subject in subjects:
469
+ # if subject['description'] in subjects_list:
470
+ descriptions.append(subject["description"])
471
+ heirarchy_string.append(
472
+ extract_heirarchy(subject["full_code"], subject["description"])
473
+ )
474
+ print("descriptions", descriptions)
475
+
476
+ populations = autocoding_response.get("population", {})
477
+ population_dict = []
478
+ if (
479
+ populations
480
+ and isinstance(populations, list)
481
+ and "description" in populations[0]
482
+ ):
483
+ for population in populations:
484
+ population_dict.append(population["description"])
485
+ state["pcs_codes"] = {
486
+ "subject": descriptions,
487
+ "heirarchy_string": heirarchy_string,
488
+ "population": population_dict,
489
+ }
490
+ print("pcs_codes_new", state["pcs_codes"])
491
+ return state
492
+
493
  def agent_factory(self) -> AgentExecutor:
494
  """
495
  Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
 
532
  return_intermediate_steps=True,
533
  )
534
 
535
+ def agent_factory_claude(self, pcs_codes, prefix) -> AgentExecutor:
536
  """
537
  Creates and configures an AgentExecutor instance for interacting with Elasticsearch.
538
 
 
545
  AgentExecutor: Configured agent ready to execute tasks with specified tools,
546
  providing detailed intermediate steps for transparency.
547
  """
 
 
 
 
 
 
 
 
 
548
  prompt = ChatPromptTemplate.from_messages(
549
  [
550
  ("system", f"You are a helpful elasticsearch assistant. {prefix}"),
 
554
  ]
555
  )
556
 
557
+ tools = [
558
+ # ListIndicesTool(),
559
+ IndexShowDataTool(),
560
+ IndexDetailsTool(),
561
+ create_search_tool(pcs_codes=pcs_codes),
562
+ ]
563
+ agent = create_tool_calling_agent(self.llm, tools, prompt)
564
+
565
  agent_executor = AgentExecutor.from_agent_and_tools(
566
+ agent=agent,
567
+ tools=tools,
568
+ verbose=True,
569
+ return_intermediate_steps=True,
570
  )
571
  # Create the agent
572
  return agent_executor
 
613
 
614
  def grant_index_agent(self, state: GraphState) -> GraphState:
615
  print("> Grant Index Agent")
616
+ # autocoding test
617
+
618
  input_data = {
619
  "input": f"""
620
  You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
 
627
  Users may not always provide the exact name, so the Elasticsearch query should accommodate partial or incomplete names
628
  by searching for relevant keywords.
629
  6. Present the response in a clear and natural language format, addressing the user's question directly.
630
+
 
631
  Description of some of the fields in the index but rest of the fields which are not here should be easy to understand:
632
+ *fiscal_year: Year when grantmaker allocates budget for funding and grants. format YYYY
633
+ *recipient_state: is abbreviated for eg. NY, FL, CA
634
+ *recipient_city - Full Name of the City e.g, New York City, Boston
635
+ *recipient_country - Country Abbreviation of the recipient organization e.g USA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
 
637
+ Note: Do not include `title`, `program_area`, `text` field in the elastic search query
638
  User's query:
639
  ```{state["query"]}```
640
  """
641
  }
642
+ pcs_codes = state["pcs_codes"]
643
+ pcs_match_term = ""
644
+ for pcs_code in pcs_codes["subject"]:
645
+ if pcs_code != "Philanthropy":
646
+ pcs_match_term += f"*'pcs_v3.subject.value.name': {pcs_code}* \n"
647
+
648
+ for pcs_code in pcs_codes["population"]:
649
+ if pcs_code != "Other population":
650
+ pcs_match_term += f"*'pcs_v3.population.value.name': {pcs_code}* \n"
651
+ print("pcs_match_term", pcs_match_term)
652
+ prefix = f"""
653
+ You are an intelligent agent tasked with generating accurate Elasticsearch DSL queries.
654
+ Analyze the intent behind the query and determine the appropriate Elasticsearch operations required.
655
+ Guidelines for generating right elastic seach query:
656
+ 1. Automatically determine whether to return document hits or aggregation results based on the query structure.
657
+ 2. Use keyword fields instead of text fields for aggregations and sorting to avoid fielddata errors
658
+ 3. Avoid using field.keyword if a keyword field is already present to prevent redundant queries.
659
+ 4. Ensure efficient query execution by selecting appropriate query types for filtering, searching, and aggregating.
660
+
661
+ Instruction for pcs_v3 Field-
662
+ If {pcs_codes['subject']} not empty:
663
+ Only include all of the following match terms. No other pcs_v3 fields should be added, duplicated, or altered except for those listed below.
664
+ - {pcs_match_term}
665
+ """
666
+ agent_exec = self.agent_factory_claude(
667
+ pcs_codes=state["pcs_codes"], prefix=prefix
668
+ )
669
  res = agent_exec.invoke(input_data)
670
  state["agent_out"] = res["output"]
 
671
  es_queries, es_results = {}, {}
672
  for i, action in enumerate(res.get("intermediate_steps", []), start=1):
673
  if action[0].tool == "elastic_index_search_tool":
674
+ print("query", action[0].tool_input.get("query"))
675
  es_queries[f"query_{i}"] = json.loads(
676
  action[0].tool_input.get("query") or "{}"
677
  )
 
697
  """
698
 
699
  print("> Org Index Agent")
700
+ mapping_description = """
701
+ "admin1_code": "state abbreviation"
702
+ "admin1_description": "Full name/label of the state"
703
+ "city": Full Name of the city with 1st letter being capital for e.g. New York City
704
+ "assets": "The assets value of the most recent fiscals available for the organization."
705
+ "country_code": "Country abbreviation"
706
+ "country_name": "Country name"
707
+ "fiscal_year": "The year of the most recent fiscals available for the organization. (YYYY format)"
708
+ "mission_statement": "The mission statement of the organization."
709
+ "roles": "grantmaker: Indicates the organization gives grants., recipient: Indicates the organization receives grants., company: Indicates the organization is a company/corporation."
710
+
711
+ """
712
  input_data = {
713
  "input": f"""
714
  You are an Elasticsearch database agent designed to accurately understand and respond to user queries. Follow these steps:
 
716
  1. Understand the user query to determine the required information.
717
  2. Query the indices in the Elasticsearch database.
718
  3. Retrieve the mappings and field names relevant to the query.
719
+ 4. Use the `organization_qa_ds1` index to extract the necessary data.
720
  5. Present the response in a clear and natural language format, addressing the user's question directly.
721
+
722
+
723
+ Given Below is mapping description of some of the fields
724
+ ```{mapping_description}```
725
+
726
 
727
+ User's query:
728
  ```{state["query"]}```
729
  """
730
  }
731
+
732
+ pcs_codes = state["pcs_codes"]
733
+ pcs_match_term = ""
734
+ for pcs_code in pcs_codes["subject"]:
735
+ pcs_match_term += f'"taxonomy_descriptions": "{pcs_code}" \n"'
736
+
737
+ print("pcs_match_term", pcs_match_term)
738
+ prefix = f"""You are an intelligent agent tasked with generating accurate Elasticsearch DSL queries.
739
+ Analyze the intent behind the query and determine the appropriate Elasticsearch operations required.
740
+ Guidelines for generating right elastic seach query:
741
+ 1. Automatically determine whether to return document hits or aggregation results based on the query structure.
742
+ 2. Use keyword fields instead of text fields for aggregations and sorting to avoid fielddata errors
743
+ 3. Avoid using field.keyword if a keyword field is already present to prevent redundant queries.
744
+ 4. Ensure efficient query execution by selecting appropriate query types for filtering, searching, and aggregating.
745
+
746
+ Instructions to use `taxonomy_descriptions` field:
747
+ If {pcs_codes['subject']} not empty, only add the following match term:
748
+ Only add the following `match` term, No other `taxonomy_descriptions` fields should be added, duplicated, or modified except belowIf {pcs_codes['subject']} not empty,
749
+ - {pcs_match_term}
750
+
751
+
752
+ Avoid using `ntee_major_description` field in the es query
753
+
754
+ """
755
+ agent_exec = self.agent_factory_claude(
756
+ pcs_codes=state["pcs_codes"], prefix=prefix
757
+ )
758
  res = agent_exec.invoke(input_data)
759
  state["agent_out"] = res["output"]
760
 
 
812
  """
813
 
814
  # Add nodes
815
+ self.add_node("Context_Extraction", self.Extract_PCS_Codes)
816
  self.add_node("analyse", self.analyse_query)
817
  self.add_node("grant-index", self.grant_index_agent)
818
  self.add_node("org-index", self.org_index_agent)
819
  self.add_node("final_answer", self.final_answer)
820
 
821
  # Set entry point
822
+ self.set_entry_point("Context_Extraction")
823
+ self.add_edge("Context_Extraction", "analyse")
824
 
825
  # Add conditional edges
826
  self.add_conditional_edges(
ask_candid/retrieval/elastic.py CHANGED
@@ -1,5 +1,4 @@
1
  from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
2
- from dataclasses import dataclass
3
  from itertools import groupby
4
 
5
  from torch.nn import functional as F
@@ -10,12 +9,14 @@ from langchain_core.documents import Document
10
  from elasticsearch import Elasticsearch
11
 
12
  from ask_candid.retrieval.sparse_lexical import SpladeEncoder
13
- from ask_candid.retrieval.sources.issuelab import IssueLabConfig
14
- from ask_candid.retrieval.sources.youtube import YoutubeConfig
15
- from ask_candid.retrieval.sources.candid_blog import CandidBlogConfig
16
- from ask_candid.retrieval.sources.candid_learning import CandidLearningConfig
17
- from ask_candid.retrieval.sources.candid_help import CandidHelpConfig
18
- from ask_candid.retrieval.sources.candid_news import CandidNewsConfig
 
 
19
  from ask_candid.services.small_lm import CandidSLM
20
  from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
21
  from ask_candid.base.config.data import DataIndices, ALL_INDICES
@@ -23,17 +24,6 @@ from ask_candid.base.config.data import DataIndices, ALL_INDICES
23
  encoder = SpladeEncoder()
24
 
25
 
26
- @dataclass
27
- class ElasticHitsResult:
28
- """Dataclass for Elasticsearch hits results
29
- """
30
- index: str
31
- id: Any
32
- score: float
33
- source: Dict[str, Any]
34
- inner_hits: Dict[str, Any]
35
-
36
-
37
  class RetrieverInput(BaseModel):
38
  """Input to the Elasticsearch retriever."""
39
  user_input: str = Field(description="query to look up in retriever")
@@ -101,7 +91,7 @@ def news_query_builder(query: str) -> Dict[str, Any]:
101
  tokens = encoder.token_expand(query)
102
 
103
  query = {
104
- "_source": ["id", "link", "title", "content"],
105
  "query": {
106
  "bool": {
107
  "filter": [
@@ -150,27 +140,27 @@ def query_builder(query: str, indices: List[DataIndices]) -> Tuple[List[Dict[str
150
  if index == "issuelab":
151
  q = build_sparse_vector_query(query=query, fields=IssueLabConfig.text_fields)
152
  q["_source"] = {"excludes": ["embeddings"]}
153
- q["size"] = 1
154
  queries.extend([{"index": IssueLabConfig.index_name}, q])
155
  elif index == "youtube":
156
  q = build_sparse_vector_query(query=query, fields=YoutubeConfig.text_fields)
157
  q["_source"] = {"excludes": ["embeddings", *YoutubeConfig.excluded_fields]}
158
- q["size"] = 2
159
  queries.extend([{"index": YoutubeConfig.index_name}, q])
160
  elif index == "candid_blog":
161
  q = build_sparse_vector_query(query=query, fields=CandidBlogConfig.text_fields)
162
  q["_source"] = {"excludes": ["embeddings"]}
163
- q["size"] = 2
164
  queries.extend([{"index": CandidBlogConfig.index_name}, q])
165
  elif index == "candid_learning":
166
  q = build_sparse_vector_query(query=query, fields=CandidLearningConfig.text_fields)
167
  q["_source"] = {"excludes": ["embeddings"]}
168
- q["size"] = 2
169
  queries.extend([{"index": CandidLearningConfig.index_name}, q])
170
  elif index == "candid_help":
171
  q = build_sparse_vector_query(query=query, fields=CandidHelpConfig.text_fields)
172
  q["_source"] = {"excludes": ["embeddings"]}
173
- q["size"] = 2
174
  queries.extend([{"index": CandidHelpConfig.index_name}, q])
175
  elif index == "news":
176
  q = news_query_builder(query=query)
@@ -199,12 +189,18 @@ def multi_search(
199
  def _msearch_response_generator(responses: List[Dict[str, Any]]) -> Iterator[ElasticHitsResult]:
200
  for query_group in responses:
201
  for h in query_group.get("hits", {}).get("hits", []):
 
 
 
 
 
 
202
  yield ElasticHitsResult(
203
  index=h["_index"],
204
  id=h["_id"],
205
  score=h["_score"],
206
  source=h["_source"],
207
- inner_hits=h.get("inner_hits", {})
208
  )
209
 
210
  results = []
@@ -264,6 +260,10 @@ def retrieved_text(hits: Dict[str, Any]) -> str:
264
 
265
  text = []
266
  for _, v in hits.items():
 
 
 
 
267
  for h in (v.get("hits", {}).get("hits") or []):
268
  for _, field in h.get("fields", {}).items():
269
  for chunk in field:
@@ -298,7 +298,8 @@ def cosine_rescore(query: str, contexts: List[str]) -> List[float]:
298
 
299
  def reranker(
300
  query_results: Iterable[ElasticHitsResult],
301
- search_text: Optional[str] = None
 
302
  ) -> Iterator[ElasticHitsResult]:
303
  """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
304
  This will shuffle results
@@ -327,58 +328,13 @@ def reranker(
327
  text = retrieved_text(d.inner_hits)
328
  texts.append(text)
329
 
330
- # if search_text and len(texts) == len(results):
331
- # scores = cosine_rescore(search_text, texts)
332
- # for r, s in zip(results, scores):
333
- # r.score = s
334
-
335
- yield from sorted(results, key=lambda x: x.score, reverse=True)
336
 
337
-
338
- def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
339
- """Pads the relevant chunk of text with context before and after
340
-
341
- Parameters
342
- ----------
343
- field_name : str
344
- a field with the long text that was chunked into pieces
345
- hit : ElasticHitsResult
346
- context_length : int, optional
347
- length of text to add before and after the chunk, by default 1024
348
-
349
- Returns
350
- -------
351
- str
352
- longer chunks stuffed together
353
- """
354
-
355
- chunks = []
356
- # NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after tokenization
357
- long_text = hit.source.get(f"{field_name}", "")
358
- long_text = long_text.lower()
359
- inner_hits_field = f"embeddings.{field_name}.chunks"
360
- found_chunks = hit.inner_hits.get(inner_hits_field, {})
361
- if found_chunks:
362
- hits = found_chunks.get("hits", {}).get("hits", [])
363
- for h in hits:
364
- chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
365
-
366
- # cutting the middle because we may have tokenizing artifacts there
367
- chunk = chunk[3: -3]
368
-
369
- if add_context:
370
- # Find the start and end indices of the chunk in the large text
371
- start_index = long_text.find(chunk[:20])
372
-
373
- # Chunk is found
374
- if start_index != -1:
375
- end_index = start_index + len(chunk)
376
- pre_start_index = max(0, start_index - context_length)
377
- post_end_index = min(len(long_text), end_index + context_length)
378
- chunks.append(long_text[pre_start_index:post_end_index])
379
- else:
380
- chunks.append(chunk)
381
- return '\n\n'.join(chunks)
382
 
383
 
384
  def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
@@ -394,94 +350,17 @@ def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
394
  """
395
 
396
  if "issuelab-elser" in hit.index:
397
- combined_item_description = hit.source.get("combined_item_description", "") # title inside
398
- description = hit.source.get("description", "")
399
- combined_issuelab_findings = hit.source.get("combined_issuelab_findings", "")
400
- # we only need to process long texts
401
- chunks_with_context_txt = get_context("content", hit, context_length=12)
402
- doc = Document(
403
- page_content='\n\n'.join([
404
- combined_item_description,
405
- combined_issuelab_findings,
406
- description,
407
- chunks_with_context_txt
408
- ]),
409
- metadata={
410
- "title": hit.source["title"],
411
- "source": "IssueLab",
412
- "source_id": hit.source["resource_id"],
413
- "url": hit.source.get("permalink", "")
414
- }
415
- )
416
  elif "youtube" in hit.index:
417
- title = hit.source.get("title", "")
418
- # we only need to process long texts
419
- description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
420
- captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)
421
- doc = Document(
422
- page_content='\n\n'.join([title, description_cleaned_with_context_txt, captions_cleaned_with_context_txt]),
423
- metadata={
424
- "title": title,
425
- "source": "Candid YouTube",
426
- "source_id": hit.source['video_id'],
427
- "url": f"https://www.youtube.com/watch?v&#61;{hit.source['video_id']}"
428
- }
429
- )
430
  elif "candid-blog" in hit.index:
431
- excerpt = hit.source.get("excerpt", "")
432
- title = hit.source.get("title", "")
433
- # we only need to process long text
434
- content_with_context_txt = get_context("content", hit, context_length=12, add_context=False)
435
- authors = get_context("authors_text", hit, context_length=12, add_context=False)
436
- tags = hit.source.get("title_summary_tags", "")
437
- doc = Document(
438
- page_content='\n\n'.join([title, excerpt, content_with_context_txt, authors, tags]),
439
- metadata={
440
- "title": title,
441
- "source": "Candid Blog",
442
- "source_id": hit.source["id"],
443
- "url": hit.source["link"]
444
- }
445
- )
446
  elif "candid-learning" in hit.index:
447
- title = hit.source.get("title", "")
448
- content_with_context_txt = get_context("content", hit, context_length=12)
449
- training_topics = hit.source.get("training_topics", "")
450
- staff_recommendations = hit.source.get("staff_recommendations", "")
451
-
452
- doc = Document(
453
- page_content='\n\n'.join([title, staff_recommendations, training_topics, content_with_context_txt]),
454
- metadata={
455
- "title": hit.source["title"],
456
- "source": "Candid Learning",
457
- "source_id": hit.source["post_id"],
458
- "url": hit.source.get("url", "")
459
- }
460
- )
461
  elif "candid-help" in hit.index:
462
- title = hit.source.get("title", "")
463
- content_with_context_txt = get_context("content", hit, context_length=12)
464
- combined_article_description = hit.source.get("combined_article_description", "")
465
-
466
- doc = Document(
467
- page_content='\n\n'.join([combined_article_description, content_with_context_txt]),
468
- metadata={
469
- "title": title,
470
- "source": "Candid Help",
471
- "source_id": hit.source["id"],
472
- "url": hit.source.get("link", "")
473
- }
474
- )
475
  elif "news" in hit.index:
476
- doc = Document(
477
- page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
478
- metadata={
479
- "title": hit.source.get("title", ""),
480
- "source": "Candid News",
481
- "source_id": hit.source["id"],
482
- "url": hit.source.get("link", "")
483
- }
484
- )
485
  else:
486
  doc = None
487
  return doc
 
1
  from typing import List, Tuple, Dict, Iterable, Iterator, Optional, Union, Any
 
2
  from itertools import groupby
3
 
4
  from torch.nn import functional as F
 
9
  from elasticsearch import Elasticsearch
10
 
11
  from ask_candid.retrieval.sparse_lexical import SpladeEncoder
12
+ from ask_candid.retrieval.sources.schema import ElasticHitsResult
13
+ from ask_candid.retrieval.sources.issuelab import IssueLabConfig, process_issuelab_hit
14
+ from ask_candid.retrieval.sources.youtube import YoutubeConfig, process_youtube_hit
15
+ from ask_candid.retrieval.sources.candid_blog import CandidBlogConfig, process_blog_hit
16
+ from ask_candid.retrieval.sources.candid_learning import CandidLearningConfig, process_learning_hit
17
+ from ask_candid.retrieval.sources.candid_help import CandidHelpConfig, process_help_hit
18
+ from ask_candid.retrieval.sources.candid_news import CandidNewsConfig, process_news_hit
19
+
20
  from ask_candid.services.small_lm import CandidSLM
21
  from ask_candid.base.config.connections import SEMANTIC_ELASTIC_QA, NEWS_ELASTIC
22
  from ask_candid.base.config.data import DataIndices, ALL_INDICES
 
24
  encoder = SpladeEncoder()
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
27
  class RetrieverInput(BaseModel):
28
  """Input to the Elasticsearch retriever."""
29
  user_input: str = Field(description="query to look up in retriever")
 
91
  tokens = encoder.token_expand(query)
92
 
93
  query = {
94
+ "_source": ["id", "link", "title", "content", "site_name"],
95
  "query": {
96
  "bool": {
97
  "filter": [
 
140
  if index == "issuelab":
141
  q = build_sparse_vector_query(query=query, fields=IssueLabConfig.text_fields)
142
  q["_source"] = {"excludes": ["embeddings"]}
143
+ q["size"] = 2
144
  queries.extend([{"index": IssueLabConfig.index_name}, q])
145
  elif index == "youtube":
146
  q = build_sparse_vector_query(query=query, fields=YoutubeConfig.text_fields)
147
  q["_source"] = {"excludes": ["embeddings", *YoutubeConfig.excluded_fields]}
148
+ q["size"] = 5
149
  queries.extend([{"index": YoutubeConfig.index_name}, q])
150
  elif index == "candid_blog":
151
  q = build_sparse_vector_query(query=query, fields=CandidBlogConfig.text_fields)
152
  q["_source"] = {"excludes": ["embeddings"]}
153
+ q["size"] = 5
154
  queries.extend([{"index": CandidBlogConfig.index_name}, q])
155
  elif index == "candid_learning":
156
  q = build_sparse_vector_query(query=query, fields=CandidLearningConfig.text_fields)
157
  q["_source"] = {"excludes": ["embeddings"]}
158
+ q["size"] = 5
159
  queries.extend([{"index": CandidLearningConfig.index_name}, q])
160
  elif index == "candid_help":
161
  q = build_sparse_vector_query(query=query, fields=CandidHelpConfig.text_fields)
162
  q["_source"] = {"excludes": ["embeddings"]}
163
+ q["size"] = 5
164
  queries.extend([{"index": CandidHelpConfig.index_name}, q])
165
  elif index == "news":
166
  q = news_query_builder(query=query)
 
189
  def _msearch_response_generator(responses: List[Dict[str, Any]]) -> Iterator[ElasticHitsResult]:
190
  for query_group in responses:
191
  for h in query_group.get("hits", {}).get("hits", []):
192
+ inner_hits = h.get("inner_hits", {})
193
+
194
+ if not inner_hits:
195
+ if "news" in h.get("_index"):
196
+ inner_hits = {"text": h.get("_source", {}).get("content")}
197
+
198
  yield ElasticHitsResult(
199
  index=h["_index"],
200
  id=h["_id"],
201
  score=h["_score"],
202
  source=h["_source"],
203
+ inner_hits=inner_hits
204
  )
205
 
206
  results = []
 
260
 
261
  text = []
262
  for _, v in hits.items():
263
+ if _ == "text":
264
+ text.append(v)
265
+ continue
266
+
267
  for h in (v.get("hits", {}).get("hits") or []):
268
  for _, field in h.get("fields", {}).items():
269
  for chunk in field:
 
298
 
299
  def reranker(
300
  query_results: Iterable[ElasticHitsResult],
301
+ search_text: Optional[str] = None,
302
+ max_num_results: int = 10
303
  ) -> Iterator[ElasticHitsResult]:
304
  """Reranks Elasticsearch hits coming from multiple indices/queries which may have scores on different scales.
305
  This will shuffle results
 
328
  text = retrieved_text(d.inner_hits)
329
  texts.append(text)
330
 
331
+ if search_text and len(texts) == len(results):
332
+ # scores = cosine_rescore(search_text, texts)
333
+ scores = encoder.query_reranking(query=search_text, documents=texts)
334
+ for r, s in zip(results, scores):
335
+ r.score = s
 
336
 
337
+ yield from sorted(results, key=lambda x: x.score, reverse=True)[:max_num_results]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
 
339
 
340
  def process_hit(hit: ElasticHitsResult) -> Union[Document, None]:
 
350
  """
351
 
352
  if "issuelab-elser" in hit.index:
353
+ doc = process_issuelab_hit(hit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  elif "youtube" in hit.index:
355
+ doc = process_youtube_hit(hit)
 
 
 
 
 
 
 
 
 
 
 
 
356
  elif "candid-blog" in hit.index:
357
+ doc = process_blog_hit(hit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  elif "candid-learning" in hit.index:
359
+ doc = process_learning_hit(hit)
 
 
 
 
 
 
 
 
 
 
 
 
 
360
  elif "candid-help" in hit.index:
361
+ doc = process_help_hit(hit)
 
 
 
 
 
 
 
 
 
 
 
 
362
  elif "news" in hit.index:
363
+ doc = process_news_hit(hit)
 
 
 
 
 
 
 
 
364
  else:
365
  doc = None
366
  return doc
ask_candid/retrieval/sources/candid_blog.py CHANGED
@@ -1,6 +1,9 @@
1
  from typing import Dict, Any
2
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
3
 
 
 
 
 
4
 
5
  CandidBlogConfig = ElasticSourceConfig(
6
  index_name="search-semantic-candid-blog",
@@ -8,6 +11,24 @@ CandidBlogConfig = ElasticSourceConfig(
8
  )
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
12
  url = f"{doc['link']}"
13
  fields = ["title", "excerpt"]
 
1
  from typing import Dict, Any
 
2
 
3
+ from langchain_core.documents import Document
4
+
5
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
6
+ from ask_candid.retrieval.sources.utils import get_context
7
 
8
  CandidBlogConfig = ElasticSourceConfig(
9
  index_name="search-semantic-candid-blog",
 
11
  )
12
 
13
 
14
+ def process_blog_hit(hit: ElasticHitsResult) -> Document:
15
+ excerpt = hit.source.get("excerpt", "")
16
+ title = hit.source.get("title", "")
17
+ # we only need to process long text
18
+ content_with_context_txt = get_context("content", hit, context_length=12, add_context=False)
19
+ authors = get_context("authors_text", hit, context_length=12, add_context=False)
20
+ tags = hit.source.get("title_summary_tags", "")
21
+ return Document(
22
+ page_content='\n\n'.join([title, excerpt, content_with_context_txt, authors, tags]),
23
+ metadata={
24
+ "title": title,
25
+ "source": "Candid Blog",
26
+ "source_id": hit.source["id"],
27
+ "url": hit.source["link"]
28
+ }
29
+ )
30
+
31
+
32
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
33
  url = f"{doc['link']}"
34
  fields = ["title", "excerpt"]
ask_candid/retrieval/sources/candid_help.py CHANGED
@@ -1,6 +1,9 @@
1
  from typing import Dict, Any
2
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
3
 
 
 
 
 
4
 
5
  CandidHelpConfig = ElasticSourceConfig(
6
  index_name="search-semantic-candid-help-elser_ve1",
@@ -8,6 +11,22 @@ CandidHelpConfig = ElasticSourceConfig(
8
  )
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
12
  url = f"{doc['link']}"
13
  fields = ["title", "summary"]
 
1
  from typing import Dict, Any
 
2
 
3
+ from langchain_core.documents import Document
4
+
5
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
6
+ from ask_candid.retrieval.sources.utils import get_context
7
 
8
  CandidHelpConfig = ElasticSourceConfig(
9
  index_name="search-semantic-candid-help-elser_ve1",
 
11
  )
12
 
13
 
14
+ def process_help_hit(hit: ElasticHitsResult) -> Document:
15
+ title = hit.source.get("title", "")
16
+ content_with_context_txt = get_context("content", hit, context_length=12)
17
+ combined_article_description = hit.source.get("combined_article_description", "")
18
+
19
+ return Document(
20
+ page_content='\n\n'.join([combined_article_description, content_with_context_txt]),
21
+ metadata={
22
+ "title": title,
23
+ "source": "Candid Help",
24
+ "source_id": hit.source["id"],
25
+ "url": hit.source.get("link", "")
26
+ }
27
+ )
28
+
29
+
30
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
31
  url = f"{doc['link']}"
32
  fields = ["title", "summary"]
ask_candid/retrieval/sources/candid_learning.py CHANGED
@@ -1,5 +1,9 @@
1
  from typing import Dict, Any
2
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
 
 
 
 
3
 
4
 
5
  CandidLearningConfig = ElasticSourceConfig(
@@ -8,6 +12,23 @@ CandidLearningConfig = ElasticSourceConfig(
8
  )
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
12
  url = f"{doc['url']}"
13
  fields = ["title", "excerpt"]
 
1
  from typing import Dict, Any
2
+
3
+ from langchain_core.documents import Document
4
+
5
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
6
+ from ask_candid.retrieval.sources.utils import get_context
7
 
8
 
9
  CandidLearningConfig = ElasticSourceConfig(
 
12
  )
13
 
14
 
15
+ def process_learning_hit(hit: ElasticHitsResult) -> Document:
16
+ title = hit.source.get("title", "")
17
+ content_with_context_txt = get_context("content", hit, context_length=12)
18
+ training_topics = hit.source.get("training_topics", "")
19
+ staff_recommendations = hit.source.get("staff_recommendations", "")
20
+
21
+ return Document(
22
+ page_content='\n\n'.join([title, staff_recommendations, training_topics, content_with_context_txt]),
23
+ metadata={
24
+ "title": hit.source["title"],
25
+ "source": "Candid Learning",
26
+ "source_id": hit.source["post_id"],
27
+ "url": hit.source.get("url", "")
28
+ }
29
+ )
30
+
31
+
32
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
33
  url = f"{doc['url']}"
34
  fields = ["title", "excerpt"]
ask_candid/retrieval/sources/candid_news.py CHANGED
@@ -1,7 +1,20 @@
1
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
2
 
 
3
 
4
  CandidNewsConfig = ElasticSourceConfig(
5
  index_name="news_1",
6
  text_fields=("title", "content")
7
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_core.documents import Document
2
 
3
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
4
 
5
  CandidNewsConfig = ElasticSourceConfig(
6
  index_name="news_1",
7
  text_fields=("title", "content")
8
  )
9
+
10
+
11
+ def process_news_hit(hit: ElasticHitsResult) -> Document:
12
+ return Document(
13
+ page_content='\n\n'.join([hit.source.get("title", ""), hit.source.get("content", "")]),
14
+ metadata={
15
+ "title": hit.source.get("title", ""),
16
+ "source": hit.source.get("site_name") or "Candid News",
17
+ "source_id": hit.source["id"],
18
+ "url": hit.source.get("link", "")
19
+ }
20
+ )
ask_candid/retrieval/sources/issuelab.py CHANGED
@@ -1,6 +1,9 @@
1
  from typing import Dict, Any
2
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
3
 
 
 
 
 
4
 
5
  IssueLabConfig = ElasticSourceConfig(
6
  index_name="search-semantic-issuelab-elser_ve2",
@@ -8,11 +11,33 @@ IssueLabConfig = ElasticSourceConfig(
8
  )
9
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
12
  chunks_html = ""
13
  if show_chunks:
14
  cleaned_text = []
15
- for k, v in doc["inner_hits"].items():
16
  hits = v["hits"]["hits"]
17
  for h in hits:
18
  for k1, v1 in h["fields"].items():
 
1
  from typing import Dict, Any
 
2
 
3
+ from langchain_core.documents import Document
4
+
5
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
6
+ from ask_candid.retrieval.sources.utils import get_context
7
 
8
  IssueLabConfig = ElasticSourceConfig(
9
  index_name="search-semantic-issuelab-elser_ve2",
 
11
  )
12
 
13
 
14
+ def process_issuelab_hit(hit: ElasticHitsResult) -> Document:
15
+ combined_item_description = hit.source.get("combined_item_description", "") # title inside
16
+ description = hit.source.get("description", "")
17
+ combined_issuelab_findings = hit.source.get("combined_issuelab_findings", "")
18
+ # we only need to process long texts
19
+ chunks_with_context_txt = get_context("content", hit, context_length=12)
20
+ return Document(
21
+ page_content='\n\n'.join([
22
+ combined_item_description,
23
+ combined_issuelab_findings,
24
+ description,
25
+ chunks_with_context_txt
26
+ ]),
27
+ metadata={
28
+ "title": hit.source["title"],
29
+ "source": "IssueLab",
30
+ "source_id": hit.source["resource_id"],
31
+ "url": hit.source.get("permalink", "")
32
+ }
33
+ )
34
+
35
+
36
  def issuelab_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
37
  chunks_html = ""
38
  if show_chunks:
39
  cleaned_text = []
40
+ for _, v in doc["inner_hits"].items():
41
  hits = v["hits"]["hits"]
42
  for h in hits:
43
  for k1, v1 in h["fields"].items():
ask_candid/retrieval/sources/schema.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Tuple, Optional
2
  from dataclasses import dataclass, field
3
 
4
 
@@ -7,3 +7,14 @@ class ElasticSourceConfig:
7
  index_name: str
8
  text_fields: Tuple[str]
9
  excluded_fields: Optional[Tuple[str]] = field(default_factory=tuple)
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Dict, Optional, Any
2
  from dataclasses import dataclass, field
3
 
4
 
 
7
  index_name: str
8
  text_fields: Tuple[str]
9
  excluded_fields: Optional[Tuple[str]] = field(default_factory=tuple)
10
+
11
+
12
+ @dataclass
13
+ class ElasticHitsResult:
14
+ """Dataclass for Elasticsearch hits results
15
+ """
16
+ index: str
17
+ id: Any
18
+ score: float
19
+ source: Dict[str, Any]
20
+ inner_hits: Dict[str, Any]
ask_candid/retrieval/sources/utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ask_candid.retrieval.sources.schema import ElasticHitsResult
2
+
3
+
4
+ def get_context(field_name: str, hit: ElasticHitsResult, context_length: int = 1024, add_context: bool = True) -> str:
5
+ """Pads the relevant chunk of text with context before and after
6
+
7
+ Parameters
8
+ ----------
9
+ field_name : str
10
+ a field with the long text that was chunked into pieces
11
+ hit : ElasticHitsResult
12
+ context_length : int, optional
13
+ length of text to add before and after the chunk, by default 1024
14
+
15
+ Returns
16
+ -------
17
+ str
18
+ longer chunks stuffed together
19
+ """
20
+
21
+ chunks = []
22
+ # NOTE chunks have tokens, long text is a normal text, but may contain html that also gets weird after tokenization
23
+ long_text = hit.source.get(f"{field_name}", "")
24
+ long_text = long_text.lower()
25
+ inner_hits_field = f"embeddings.{field_name}.chunks"
26
+ found_chunks = hit.inner_hits.get(inner_hits_field, {})
27
+ if found_chunks:
28
+ hits = found_chunks.get("hits", {}).get("hits", [])
29
+ for h in hits:
30
+ chunk = h.get("fields", {})[inner_hits_field][0]["chunk"][0]
31
+
32
+ # cutting the middle because we may have tokenizing artifacts there
33
+ chunk = chunk[3: -3]
34
+
35
+ if add_context:
36
+ # Find the start and end indices of the chunk in the large text
37
+ start_index = long_text.find(chunk[:20])
38
+
39
+ # Chunk is found
40
+ if start_index != -1:
41
+ end_index = start_index + len(chunk)
42
+ pre_start_index = max(0, start_index - context_length)
43
+ post_end_index = min(len(long_text), end_index + context_length)
44
+ chunks.append(long_text[pre_start_index:post_end_index])
45
+ else:
46
+ chunks.append(chunk)
47
+ return '\n\n'.join(chunks)
ask_candid/retrieval/sources/youtube.py CHANGED
@@ -1,6 +1,9 @@
1
  from typing import Dict, Any
2
- from ask_candid.retrieval.sources.schema import ElasticSourceConfig
3
 
 
 
 
 
4
 
5
  YoutubeConfig = ElasticSourceConfig(
6
  index_name="search-semantic-youtube-elser_ve1",
@@ -9,6 +12,22 @@ YoutubeConfig = ElasticSourceConfig(
9
  )
10
 
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
13
  url = f"https://www.youtube.com/watch?v={doc['video_id']}"
14
  fields = ["title", "description_cleaned"]
 
1
  from typing import Dict, Any
 
2
 
3
+ from langchain_core.documents import Document
4
+
5
+ from ask_candid.retrieval.sources.schema import ElasticSourceConfig, ElasticHitsResult
6
+ from ask_candid.retrieval.sources.utils import get_context
7
 
8
  YoutubeConfig = ElasticSourceConfig(
9
  index_name="search-semantic-youtube-elser_ve1",
 
12
  )
13
 
14
 
15
+ def process_youtube_hit(hit: ElasticHitsResult) -> Document:
16
+ title = hit.source.get("title", "")
17
+ # we only need to process long texts
18
+ description_cleaned_with_context_txt = get_context("description_cleaned", hit, context_length=12)
19
+ captions_cleaned_with_context_txt = get_context("captions_cleaned", hit, context_length=12)
20
+ return Document(
21
+ page_content='\n\n'.join([title, description_cleaned_with_context_txt, captions_cleaned_with_context_txt]),
22
+ metadata={
23
+ "title": title,
24
+ "source": "Candid YouTube",
25
+ "source_id": hit.source['video_id'],
26
+ "url": f"https://www.youtube.com/watch?v&#61;{hit.source['video_id']}"
27
+ }
28
+ )
29
+
30
+
31
  def build_card_html(doc: Dict[str, Any], height_px: int = 200, show_chunks=False) -> str:
32
  url = f"https://www.youtube.com/watch?v={doc['video_id']}"
33
  fields = ["title", "description_cleaned"]
ask_candid/retrieval/sparse_lexical.py CHANGED
@@ -1,6 +1,7 @@
1
- from typing import Dict
2
 
3
  from transformers import AutoModelForMaskedLM, AutoTokenizer
 
4
  import torch
5
 
6
 
@@ -14,14 +15,23 @@ class SpladeEncoder:
14
  self.idx2token = {idx: token for token, idx in self.tokenizer.get_vocab().items()}
15
 
16
  @torch.no_grad()
17
- def token_expand(self, query: str) -> Dict[str, float]:
18
- tokens = self.tokenizer(query, return_tensors='pt')
19
  output = self.model(**tokens)
20
-
21
  vec = torch.max(
22
  torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1),
23
  dim=1
24
  )[0].squeeze()
 
 
 
 
 
 
 
 
 
 
25
  cols = vec.nonzero().squeeze().cpu().tolist()
26
  weights = vec[cols].cpu().tolist()
27
 
 
1
+ from typing import List, Dict
2
 
3
  from transformers import AutoModelForMaskedLM, AutoTokenizer
4
+ from torch.nn import functional as F
5
  import torch
6
 
7
 
 
15
  self.idx2token = {idx: token for token, idx in self.tokenizer.get_vocab().items()}
16
 
17
  @torch.no_grad()
18
+ def forward(self, texts: List[str]):
19
+ tokens = self.tokenizer(texts, return_tensors='pt', truncation=True, padding=True)
20
  output = self.model(**tokens)
 
21
  vec = torch.max(
22
  torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1),
23
  dim=1
24
  )[0].squeeze()
25
+ return vec
26
+
27
+ def query_reranking(self, query: str, documents: List[str]):
28
+ vec = self.forward([query, *documents])
29
+ xQ = F.normalize(vec[:1], dim=-1, p=2.)
30
+ xD = F.normalize(vec[1:], dim=-1, p=2.)
31
+ return (xQ * xD).sum(dim=-1).cpu().tolist()
32
+
33
+ def token_expand(self, query: str) -> Dict[str, float]:
34
+ vec = self.forward([query])
35
  cols = vec.nonzero().squeeze().cpu().tolist()
36
  weights = vec[cols].cpu().tolist()
37
 
ask_candid/tools/elastic/index_search_tool.py CHANGED
@@ -40,6 +40,7 @@ class SearchToolInput(BaseModel):
40
 
41
 
42
  def elastic_search(
 
43
  index_name: str,
44
  query: str,
45
  from_: int = 0,
@@ -107,9 +108,15 @@ def elastic_search(
107
  return msg
108
 
109
 
110
- def create_search_tool():
111
  return StructuredTool.from_function(
112
- elastic_search,
 
 
 
 
 
 
113
  name="elastic_index_search_tool",
114
  description=(
115
  """This tool allows executing queries on an Elasticsearch index efficiently. Provide:
 
40
 
41
 
42
  def elastic_search(
43
+ pcs_codes: dict,
44
  index_name: str,
45
  query: str,
46
  from_: int = 0,
 
108
  return msg
109
 
110
 
111
+ def create_search_tool(pcs_codes):
112
  return StructuredTool.from_function(
113
+ func=lambda index_name, query, from_, size: elastic_search(
114
+ pcs_codes=pcs_codes,
115
+ index_name=index_name,
116
+ query=query,
117
+ from_=from_,
118
+ size=size,
119
+ ),
120
  name="elastic_index_search_tool",
121
  description=(
122
  """This tool allows executing queries on an Elasticsearch index efficiently. Provide:
ask_candid/tools/question_reformulation.py CHANGED
@@ -1,55 +1,55 @@
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_core.output_parsers import StrOutputParser
 
3
 
 
4
 
5
- def reformulate_question_using_history(state, llm, focus_on_recommendations=False):
6
- """
7
- Transform the query to produce a better query with details from previous messages and emphasize
8
- aspects important for recommendations if needed.
9
 
10
- Args:
11
- state (dict): The current state containing messages.
12
- llm: LLM to use for generating the reformulation.
13
- focus_on_recommendations (bool): Flag to determine if the reformulation should emphasize
14
- recommendation-relevant aspects such as geographies,
15
- cause areas, etc.
16
- Returns:
17
- dict: The updated state with re-phrased question and original user_input for UI
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  """
 
19
  print("---REFORMULATE THE USER INPUT---")
20
  messages = state["messages"]
21
  question = messages[-1].content
22
 
23
- if len(messages) > 1:
24
  if focus_on_recommendations:
25
- prompt_text = """Given a chat history and the latest user input \
26
- which might reference context in the chat history, \
27
- especially geographic locations, cause areas and/or population groups, \
28
- formulate a standalone input which can be understood without the chat history.
29
- Chat history:
30
- \n ------- \n
31
- {chat_history}
32
- \n ------- \n
33
- User input:
34
- \n ------- \n
35
- {question}
36
- \n ------- \n
37
  Reformulate the question without adding implications or assumptions about the user's needs or intentions.
38
  Focus solely on clarifying any contextual details present in the original input."""
39
  else:
40
- prompt_text = """Given a chat history and the latest user input \
41
- which might reference context in the chat history, formulate a standalone input \
42
- which can be understood without the chat history.
43
- Chat history:
44
- \n ------- \n
45
- {chat_history}
46
- \n ------- \n
47
- User input:
48
- \n ------- \n
49
- {question}
50
- \n ------- \n
51
- Do NOT answer the question, \
52
- just reformulate it if needed and otherwise return it as is.
53
  """
54
 
55
  contextualize_q_prompt = ChatPromptTemplate([
@@ -58,7 +58,11 @@ def reformulate_question_using_history(state, llm, focus_on_recommendations=Fals
58
  ])
59
 
60
  rag_chain = contextualize_q_prompt | llm | StrOutputParser()
61
- new_question = rag_chain.invoke({"chat_history": messages, "question": question})
 
 
 
 
62
  print(f"user asked: '{question}', agent reformulated the question basing on the chat history: {new_question}")
63
  return {"messages": [new_question], "user_input" : question}
64
  return {"messages": [question], "user_input" : question}
 
1
  from langchain_core.prompts import ChatPromptTemplate
2
  from langchain_core.output_parsers import StrOutputParser
3
+ from langchain_core.language_models.llms import LLM
4
 
5
+ from ask_candid.agents.schema import AgentState
6
 
 
 
 
 
7
 
8
+ def reformulate_question_using_history(
9
+ state: AgentState,
10
+ llm: LLM,
11
+ focus_on_recommendations: bool = False
12
+ ) -> AgentState:
13
+ """Transform the query to produce a better query with details from previous messages and emphasize aspects important
14
+ for recommendations if needed.
15
+
16
+ Parameters
17
+ ----------
18
+ state : AgentState
19
+ The current state
20
+ llm : LLM
21
+ focus_on_recommendations : bool, optional
22
+ Flag to determine if the reformulation should emphasize recommendation-relevant aspects such as geographies,
23
+ cause areas, etc., by default False
24
+
25
+ Returns
26
+ -------
27
+ AgentState
28
+ The updated state
29
  """
30
+
31
  print("---REFORMULATE THE USER INPUT---")
32
  messages = state["messages"]
33
  question = messages[-1].content
34
 
35
+ if len(messages[:-1]) > 1: # need to skip the system message
36
  if focus_on_recommendations:
37
+ prompt_text = """Given a chat history and the latest user input which might reference context in the chat
38
+ history, especially geographic locations, cause areas and/or population groups, formulate a standalone input
39
+ which can be understood without the chat history.
40
+ Chat history: ```{chat_history}```
41
+ User input: ```{question}```
42
+
 
 
 
 
 
 
43
  Reformulate the question without adding implications or assumptions about the user's needs or intentions.
44
  Focus solely on clarifying any contextual details present in the original input."""
45
  else:
46
+ prompt_text = """Given a chat history and the latest user input which might reference context in the chat
47
+ history, formulate a standalone input which can be understood without the chat history. Include hints as to
48
+ what the user is getting at given the context in the chat history.
49
+ Chat history: ```{chat_history}```
50
+ User input: ```{question}```
51
+
52
+ Do NOT answer the question, just reformulate it if needed and otherwise return it as is.
 
 
 
 
 
 
53
  """
54
 
55
  contextualize_q_prompt = ChatPromptTemplate([
 
58
  ])
59
 
60
  rag_chain = contextualize_q_prompt | llm | StrOutputParser()
61
+ # new_question = rag_chain.invoke({"chat_history": messages, "question": question})
62
+ new_question = rag_chain.invoke({
63
+ "chat_history": '\n'.join(f"{m.type.upper()}: {m.content}" for m in messages[1:]),
64
+ "question": question
65
+ })
66
  print(f"user asked: '{question}', agent reformulated the question basing on the chat history: {new_question}")
67
  return {"messages": [new_question], "user_input" : question}
68
  return {"messages": [question], "user_input" : question}