File size: 2,759 Bytes
1587277
 
 
 
b06e07d
 
1587277
b06e07d
1587277
 
 
 
 
 
 
 
8d7a1e9
 
 
 
 
 
 
1587277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d7a1e9
1587277
 
 
 
 
 
 
 
b06e07d
 
 
 
 
 
 
1587277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06e07d
1587277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8d7a1e9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os

import instructor
from pydantic import BaseModel
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
"""
from openai import OpenAI
client = instructor.from_openai(
    OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama",
    ),
    mode=instructor.Mode.JSON,
)
"""
from groq import Groq
# Build the raw Groq client from the GROQ_API_KEY environment variable and wrap
# it with instructor's Groq patch in a single step.
client = instructor.from_groq(Groq(api_key=os.getenv("GROQ_API_KEY")))

# Model name handed to the chat-completion call: the Groq-hosted model when an
# API key is configured, the "llama3.2" name otherwise.
# NOTE(review): `client` is always a Groq client here, so the fallback name
# presumably dates from the disabled Ollama setup above -- confirm it is still wanted.
if os.getenv("GROQ_API_KEY"):
  llm = 'llama-3.1-8b-instant'
else:
  llm = "llama3.2"


# One PII item reported by the model; instances are the elements of
# PIIExtraction.private_data.
# NOTE(review): documented with `#` comments rather than a docstring on purpose --
# a model docstring presumably becomes part of the schema sent to the LLM; confirm
# before adding one.
class PIIData(BaseModel):
  index: int  # ordinal supplied by the model; sanitize() numbers placeholders by list position instead
  data_type: str  # category label; becomes the first half of the `<{data_type}_{i}>` placeholder
  pii_value: str  # exact text that sanitize() searches for and replaces in the document


class PIIExtraction(BaseModel):
  """
  Extracted PII data from a document, all data_types should try to have consistent property names
  """
  # The docstring above is intentionally left byte-identical: this class is passed
  # as `response_model`, so its text is presumably part of what the LLM sees.
  private_data: list[PIIData]
  chain_of_thought: str

  def sanitize(self, content):
    """
    Mask the extracted PII values in `content` and return the masked text.

    Every entry of `private_data` is replaced by a placeholder in the form of
    <{data_type}_{i}>, where `i` is the entry's position in `private_data`.
    Presidio then scans the result for PERSON and PHONE_NUMBER entities that
    are still present; when it finds any, they are anonymized and the returned
    text is prefixed with "GUARDRAILED: ".

    Args:
      content: the document text to scrub.

    Returns:
      The scrubbed text.
    """
    # Replace longer values first so that a short value which is a substring of
    # a longer one (e.g. "Hua" inside "He Hua") cannot break the longer match
    # and leave part of it unmasked. The sort is stable and each entry keeps
    # its original position `i`, so placeholder numbering is unchanged.
    longest_first = sorted(
      enumerate(self.private_data),
      key=lambda pair: len(pair[1].pii_value),
      reverse=True,
    )
    for i, data in longest_first:
      # str.replace("", x) inserts x between every character, so an empty
      # value coming back from the model must be skipped.
      if not data.pii_value:
        continue
      content = content.replace(data.pii_value, f"<{data.data_type}_{i}>")

    # Second pass: let Presidio look for names / phone numbers the model missed.
    presidio_analyzer = AnalyzerEngine()
    analysis = presidio_analyzer.analyze(content, language='en',
          entities=["PERSON", "PHONE_NUMBER"])

    if not analysis:
      return content

    # Only build the anonymizer when there is something left to anonymize.
    presidio_anonymizer = AnonymizerEngine()
    return "GUARDRAILED: " + presidio_anonymizer.anonymize(text=content, analyzer_results=analysis).text


def derisk(content: str) -> str:
  """
  Ask the LLM to extract the PII found in `content` and return the scrubbed text.

  The model's structured answer is parsed into a PIIExtraction, whose
  sanitize() method is then applied to the same `content`.

  Args:
    content: the document text to scrub.

  Returns:
    The text returned by PIIExtraction.sanitize(content).
  """
  extraction = client.chat.completions.create(
    model=llm,
    response_model=PIIExtraction,
    temperature=0.2,
    messages=[
      {
        "role": "system",
        "content": "You are a world class international PII scrubbing model, perform data preprocess include standardization, stop word removal, punctuation removal...to enhance signal to noise ratio for name, phone, address, email, id...etc. Extract the PII data from the following document",
      },
      {
        "role": "user",
        # Pass the text itself: `{content}` would be a set literal wrapping
        # the string, not the string.
        "content": content,
      },
    ],
  )
  return extraction.sanitize(content)


# Manual smoke test: runs only when the file is executed as a script.
if __name__ == '__main__':
  # Sample document containing names, an e-mail address, phone numbers and an
  # address-like line. The literal is passed to derisk() exactly as written,
  # including its indentation and blank lines.
  ESSAY = """
    He Hua (Hua Hua) Director
    hehua@chengdu.com
    +86-28-83505513
    
    Alternative Address Format:
    Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)
    
    
    Best Viewing: Before 9:00 AM during summer hours (7:30 AM-5:00 PM)
    
    Caretaker: Tan Jintao ("Grandpa Tan")
    
    Additional Contacts
    Charitable Donations: +86-28-83505513
    Dining Reservations: +86-17311072681
    """

  # Print whatever derisk() returns for the sample.
  print(derisk(ESSAY))
  # Disabled debugging output below; `pii_leak` is not defined anywhere in the
  # visible file.
  # print(pii_leak.model_dump_json(indent=2))
  # print(pii_leak.sanitize(ESSAY))