from typing import List, Optional
from dataclasses import dataclass
from enum import Enum

import torch

from ask_candid.base.lambda_base import LambdaInvokeBase


@dataclass(slots=True)
class Encoding:
    inputs: List[str]
    vectors: torch.Tensor


class CandidSLM(LambdaInvokeBase):
    """Wrapper around Candid's custom small language model.
    For more details see https://dev.azure.com/guidestar/DataScience/_git/graph-ai?path=/releases/language.
    This service includes:
        * text encoding
        * document summarization
        * entity salience estimation

    Parameters
    ----------
    access_key : Optional[str], optional
        AWS access key, by default None
    secret_key : Optional[str], optional
        AWS secret key, by default None
    """

    class Tasks(Enum):  # pylint: disable=missing-class-docstring
        ENCODE = "/encode"
        DOCUMENT_SUMMARIZE = "/document/summarize"
        DOCUMENT_NER_SALIENCE = "/document/entitySalience"

    def __init__(
        self, access_key: Optional[str] = None, secret_key: Optional[str] = None
    ) -> None:
        super().__init__(
            function_name="small-lm",
            access_key=access_key,
            secret_key=secret_key
        )

    def encode(self, text: List[str]) -> Encoding:
        """Encode a batch of texts into dense vectors via the ``/encode`` task.

        Returns an :class:`Encoding` holding the echoed inputs and a float32
        tensor of embedding vectors.
        """
        response = self._submit_request({"text": text, "path": self.Tasks.ENCODE.value})

        output = Encoding(
            inputs=(response.get("inputs") or []),
            vectors=torch.tensor((response.get("vectors") or []), dtype=torch.float32)
        )
        return output
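

# Illustrative usage sketch (not part of the original module): it assumes valid AWS
# credentials are either passed explicitly or resolved from the environment by
# LambdaInvokeBase, and that the "small-lm" Lambda function is deployed and
# returns "inputs" and "vectors" fields as consumed by encode() above.
if __name__ == "__main__":
    slm = CandidSLM()  # or CandidSLM(access_key="...", secret_key="...")
    encoding = slm.encode(["Candid connects nonprofits with funders."])
    # vectors has shape (batch_size, embedding_dim); the exact dimension depends on the model
    print(encoding.vectors.shape)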