|
import os |
|
|
|
import instructor |
|
from pydantic import BaseModel |
|
from presidio_analyzer import AnalyzerEngine |
|
from presidio_anonymizer import AnonymizerEngine |
|
""" |
|
from openai import OpenAI |
|
client = instructor.from_openai( |
|
OpenAI( |
|
base_url="http://localhost:11434/v1", |
|
api_key="ollama", |
|
), |
|
mode=instructor.Mode.JSON, |
|
) |
|
""" |
|
from groq import Groq |
|
|
|
# Groq SDK client wrapped by instructor, so that chat completions accept a
# ``response_model`` (see ``derisk`` below). The API key is read from the
# GROQ_API_KEY environment variable.
client = instructor.from_groq(Groq(api_key=os.getenv("GROQ_API_KEY")))

# Model name handed to every completion request.
# NOTE(review): the "llama3.2" fallback looks like it was meant for the local
# Ollama setup kept in the string literal above; the client built here is
# always a Groq client, so that fallback probably cannot work as-is - confirm.
if os.getenv("GROQ_API_KEY"):
    llm = "llama-3.1-8b-instant"
else:
    llm = "llama3.2"
|
|
|
|
|
class PIIData(BaseModel):
    # One piece of PII reported by the extraction model.
    #
    # Documented with comments rather than a docstring on purpose: pydantic
    # exposes a model's docstring through its JSON schema, so adding one would
    # change the schema text this model produces.

    # Ordinal supplied by the model. PIIExtraction.sanitize does not read it;
    # it numbers placeholders by list position instead.
    index: int

    # Label used as the placeholder prefix, i.e. the "data_type" part of
    # "<data_type_i>".
    data_type: str

    # Exact text that gets searched for and replaced in the document.
    pii_value: str
|
|
|
|
|
class PIIExtraction(BaseModel):
    """
    Extracted PII data from a document, all data_types should try to have consistent property names
    """

    # NOTE(review): pydantic exposes the docstring above as the schema
    # description, so it is presumably part of what the LLM sees - keep its
    # wording stable unless a prompt change is intended.

    # PII items found in the document, in the order the model reported them.
    private_data: list[PIIData]

    # Free-text reasoning produced by the model alongside the extraction.
    chain_of_thought: str

    def sanitize(self, content: str) -> str:
        """Mask every extracted PII value in ``content``.

        Each entry of ``private_data`` is replaced by a placeholder of the
        form ``<{data_type}_{i}>``, where ``i`` is the entry's position in
        the list. The result is then scanned with Presidio for PERSON and
        PHONE_NUMBER entities; if any are still present, the text is
        anonymized by Presidio and prefixed with ``"GUARDRAILED: "``.

        Args:
            content: Document text to sanitize.

        Returns:
            The sanitized text.
        """
        for i, data in enumerate(self.private_data):
            # str.replace with an empty needle inserts the replacement between
            # every character, which would wreck the document.
            if not data.pii_value:
                continue
            content = content.replace(data.pii_value, f"<{data.data_type}_{i}>")

        # Second pass: catch names / phone numbers the LLM extraction missed.
        analysis = AnalyzerEngine().analyze(
            content,
            language="en",
            entities=["PERSON", "PHONE_NUMBER"],
        )
        if analysis:
            anonymized = AnonymizerEngine().anonymize(
                text=content,
                analyzer_results=analysis,
            )
            content = "GUARDRAILED: " + anonymized.text
        return content
|
|
|
|
|
def derisk(content: str) -> str:
    """Extract the PII in ``content`` with the LLM and return the masked text.

    The module-level ``client`` is asked for a ``PIIExtraction`` of the
    document; that extraction is then applied to the same document via
    ``PIIExtraction.sanitize``.

    Args:
        content: Raw document text to scrub.

    Returns:
        The sanitized document text (the result of ``sanitize``, not the
        ``PIIExtraction`` object itself).
    """
    extraction = client.chat.completions.create(
        model=llm,
        response_model=PIIExtraction,
        temperature=0.2,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a world class international PII scrubbing model, "
                    "perform data preprocess include standardization, stop word removal, "
                    "punctuation removal...to enhance signal to noise ratio for name, "
                    "phone, address, email, id...etc. "
                    "Extract the PII data from the following document"
                ),
            },
            {
                "role": "user",
                # Pass the text itself: a set literal such as {content} is not
                # a valid message body and cannot be JSON-serialised.
                "content": content,
            },
        ],
    )
    return extraction.sanitize(content)
|
|
|
|
|
if __name__ == "__main__":
    # Demo input: a short directory-style text containing names, an e-mail
    # address, phone numbers and a street address.
    ESSAY = """
He Hua (Hua Hua) Director
hehua@chengdu.com
+86-28-83505513

Alternative Address Format:
Xiongmao Ave West Section, Jinniu District (listed in some records as 610016 postcode)


Best Viewing: Before 9:00 AM during summer hours (7:30 AM-5:00 PM)

Caretaker: Tan Jintao ("Grandpa Tan")

Additional Contacts
Charitable Donations: +86-28-83505513
Dining Reservations: +86-17311072681
"""

    # Run the full pipeline on the sample and show the scrubbed text.
    scrubbed = derisk(ESSAY)
    print(scrubbed)
|
|
|
|
|
|