Module pii_codex.models.analysis
Expand source code
# pylint: disable=too-many-instance-attributes
from __future__ import annotations
from dataclasses import dataclass, field
from typing import List, Counter, Optional
from pii_codex.models.common import RiskLevel, RiskLevelDefinition
# PII detection, risk assessment, and analysis models
@dataclass
class RiskAssessment:
"""
Singular risk assessment for a string token
"""
pii_type_detected: Optional[str] = None
risk_level: int = RiskLevel.LEVEL_ONE.value
risk_level_definition: str = (
RiskLevelDefinition.LEVEL_ONE.value
) # Default if it's not semi or fully identifiable
cluster_membership_type: Optional[str] = None
hipaa_category: Optional[str] = None
dhs_category: Optional[str] = None
nist_category: Optional[str] = None
@dataclass
class RiskAssessmentList:
"""
Risk Assessments and the average risk score of all list items
"""
risk_assessments: List[RiskAssessment]
average_risk_score: float
@dataclass
class DetectionResultItem:
"""
Type associated with a singular PII detection (e.g. detection of an email in a string), its associated risk score,
and where it is located in a string.
"""
entity_type: str
score: float = 0.0 # metadata detections don't have confidence score values
start: int = 0 # metadata detections don't have offset values
end: int = 0 # metadata detections don't have offset values
@dataclass
class DetectionResult:
"""
List of detection results and the index of the string they pertain to
"""
detections: List[DetectionResultItem]
index: int = 0
@dataclass
class AnalysisResultItem:
"""
The results associated to a single detection of a single string (e.g. Social Media Post, SMS, etc.)
"""
detection: Optional[DetectionResultItem]
risk_assessment: RiskAssessment
def to_dict(self):
return {
"riskAssessment": self.risk_assessment.__dict__,
"detection": self.detection.__dict__,
}
def to_flattened_dict(self):
assessment = self.risk_assessment.__dict__.copy()
if self.detection:
assessment.update(self.detection.__dict__)
return assessment
@dataclass
class AnalysisResult:
"""
The analysis results associated with several detections within a single string (e.g. Social Media Post, SMS, etc.)
"""
analysis: List[AnalysisResultItem]
index: int = 0
risk_score_mean: float = 0.0
sanitized_text: str = ""
def to_dict(self):
return {
"analysis": [item.to_flattened_dict() for item in self.analysis],
"index": self.index,
"risk_score_mean": self.risk_score_mean,
"sanitized_text": self.sanitized_text,
}
def get_detected_types(self) -> List[str]:
return [pii.detection.entity_type for pii in self.analysis if pii.detection]
@dataclass
class AnalysisResultSet:
"""
The analysis results associated with a collection of strings or documents (e.g. Social Media Posts, forum thread,
etc.). Includes most/least detected PII types within the collection, average risk score of analyses,
"""
analyses: List[AnalysisResult]
detection_count: int = 0
detected_pii_types: set[str] = field(default_factory=set)
detected_pii_type_frequencies: Counter = None # type: ignore
risk_scores: List[float] = field(default_factory=list)
risk_score_mean: float = 1.0 # Default is 1 for non-identifiable
risk_score_mode: float = 0.0
risk_score_median: float = 0.0
risk_score_standard_deviation: float = 0.0
risk_score_variance: float = 0.0
collection_name: Optional[
str
] = None # Optional ability for analysts to name a set (see analysis storage step in notebooks)
collection_type: str = "POPULATION" # Other option is SAMPLE
def to_dict(self):
return {
"collection_name": self.collection_name,
"collection_type": self.collection_type,
"analyses": [item.to_dict() for item in self.analyses],
"detection_count": self.detection_count,
"risk_scores": self.risk_scores,
"risk_score_mean": self.risk_score_mean,
"risk_score_mode": self.risk_score_mode,
"risk_score_median": self.risk_score_median,
"risk_score_standard_deviation": self.risk_score_standard_deviation,
"risk_score_variance": self.risk_score_variance,
"detected_pii_types": self.detected_pii_types,
"detected_pii_type_frequencies": dict(self.detected_pii_type_frequencies),
}
Classes
class AnalysisResult (analysis: List[AnalysisResultItem], index: int = 0, risk_score_mean: float = 0.0, sanitized_text: str = '')-
The analysis results associated with several detections within a single string (e.g. Social Media Post, SMS, etc.)
Expand source code
@dataclass class AnalysisResult: """ The analysis results associated with several detections within a single string (e.g. Social Media Post, SMS, etc.) """ analysis: List[AnalysisResultItem] index: int = 0 risk_score_mean: float = 0.0 sanitized_text: str = "" def to_dict(self): return { "analysis": [item.to_flattened_dict() for item in self.analysis], "index": self.index, "risk_score_mean": self.risk_score_mean, "sanitized_text": self.sanitized_text, } def get_detected_types(self) -> List[str]: return [pii.detection.entity_type for pii in self.analysis if pii.detection]Class variables
var analysis : List[AnalysisResultItem]var index : intvar risk_score_mean : floatvar sanitized_text : str
Methods
def get_detected_types(self) ‑> List[str]-
Expand source code
def get_detected_types(self) -> List[str]: return [pii.detection.entity_type for pii in self.analysis if pii.detection] def to_dict(self)-
Expand source code
def to_dict(self): return { "analysis": [item.to_flattened_dict() for item in self.analysis], "index": self.index, "risk_score_mean": self.risk_score_mean, "sanitized_text": self.sanitized_text, }
class AnalysisResultItem (detection: Optional[DetectionResultItem], risk_assessment: RiskAssessment)-
The results associated to a single detection of a single string (e.g. Social Media Post, SMS, etc.)
Expand source code
@dataclass class AnalysisResultItem: """ The results associated to a single detection of a single string (e.g. Social Media Post, SMS, etc.) """ detection: Optional[DetectionResultItem] risk_assessment: RiskAssessment def to_dict(self): return { "riskAssessment": self.risk_assessment.__dict__, "detection": self.detection.__dict__, } def to_flattened_dict(self): assessment = self.risk_assessment.__dict__.copy() if self.detection: assessment.update(self.detection.__dict__) return assessmentClass variables
var detection : Optional[DetectionResultItem]var risk_assessment : RiskAssessment
Methods
def to_dict(self)-
Expand source code
def to_dict(self): return { "riskAssessment": self.risk_assessment.__dict__, "detection": self.detection.__dict__, } def to_flattened_dict(self)-
Expand source code
def to_flattened_dict(self): assessment = self.risk_assessment.__dict__.copy() if self.detection: assessment.update(self.detection.__dict__) return assessment
class AnalysisResultSet (analyses: List[AnalysisResult], detection_count: int = 0, detected_pii_types: set[str] = <factory>, detected_pii_type_frequencies: Counter = None, risk_scores: List[float] = <factory>, risk_score_mean: float = 1.0, risk_score_mode: float = 0.0, risk_score_median: float = 0.0, risk_score_standard_deviation: float = 0.0, risk_score_variance: float = 0.0, collection_name: Optional[str] = None, collection_type: str = 'POPULATION')-
The analysis results associated with a collection of strings or documents (e.g. Social Media Posts, forum thread, etc.). Includes most/least detected PII types within the collection, average risk score of analyses,
Expand source code
@dataclass class AnalysisResultSet: """ The analysis results associated with a collection of strings or documents (e.g. Social Media Posts, forum thread, etc.). Includes most/least detected PII types within the collection, average risk score of analyses, """ analyses: List[AnalysisResult] detection_count: int = 0 detected_pii_types: set[str] = field(default_factory=set) detected_pii_type_frequencies: Counter = None # type: ignore risk_scores: List[float] = field(default_factory=list) risk_score_mean: float = 1.0 # Default is 1 for non-identifiable risk_score_mode: float = 0.0 risk_score_median: float = 0.0 risk_score_standard_deviation: float = 0.0 risk_score_variance: float = 0.0 collection_name: Optional[ str ] = None # Optional ability for analysts to name a set (see analysis storage step in notebooks) collection_type: str = "POPULATION" # Other option is SAMPLE def to_dict(self): return { "collection_name": self.collection_name, "collection_type": self.collection_type, "analyses": [item.to_dict() for item in self.analyses], "detection_count": self.detection_count, "risk_scores": self.risk_scores, "risk_score_mean": self.risk_score_mean, "risk_score_mode": self.risk_score_mode, "risk_score_median": self.risk_score_median, "risk_score_standard_deviation": self.risk_score_standard_deviation, "risk_score_variance": self.risk_score_variance, "detected_pii_types": self.detected_pii_types, "detected_pii_type_frequencies": dict(self.detected_pii_type_frequencies), }Class variables
var analyses : List[AnalysisResult]var collection_name : Optional[str]var collection_type : strvar detected_pii_type_frequencies : Countervar detected_pii_types : set[str]var detection_count : intvar risk_score_mean : floatvar risk_score_median : floatvar risk_score_mode : floatvar risk_score_standard_deviation : floatvar risk_score_variance : floatvar risk_scores : List[float]
Methods
def to_dict(self)-
Expand source code
def to_dict(self): return { "collection_name": self.collection_name, "collection_type": self.collection_type, "analyses": [item.to_dict() for item in self.analyses], "detection_count": self.detection_count, "risk_scores": self.risk_scores, "risk_score_mean": self.risk_score_mean, "risk_score_mode": self.risk_score_mode, "risk_score_median": self.risk_score_median, "risk_score_standard_deviation": self.risk_score_standard_deviation, "risk_score_variance": self.risk_score_variance, "detected_pii_types": self.detected_pii_types, "detected_pii_type_frequencies": dict(self.detected_pii_type_frequencies), }
class DetectionResult (detections: List[DetectionResultItem], index: int = 0)-
List of detection results and the index of the string they pertain to
Expand source code
@dataclass class DetectionResult: """ List of detection results and the index of the string they pertain to """ detections: List[DetectionResultItem] index: int = 0Class variables
var detections : List[DetectionResultItem]var index : int
class DetectionResultItem (entity_type: str, score: float = 0.0, start: int = 0, end: int = 0)-
Type associated with a singular PII detection (e.g. detection of an email in a string), its associated risk score, and where it is located in a string.
Expand source code
@dataclass class DetectionResultItem: """ Type associated with a singular PII detection (e.g. detection of an email in a string), its associated risk score, and where it is located in a string. """ entity_type: str score: float = 0.0 # metadata detections don't have confidence score values start: int = 0 # metadata detections don't have offset values end: int = 0 # metadata detections don't have offset valuesClass variables
var end : intvar entity_type : strvar score : floatvar start : int
class RiskAssessment (pii_type_detected: Optional[str] = None, risk_level: int = 1, risk_level_definition: str = 'Non-Identifiable', cluster_membership_type: Optional[str] = None, hipaa_category: Optional[str] = None, dhs_category: Optional[str] = None, nist_category: Optional[str] = None)-
Singular risk assessment for a string token
Expand source code
@dataclass class RiskAssessment: """ Singular risk assessment for a string token """ pii_type_detected: Optional[str] = None risk_level: int = RiskLevel.LEVEL_ONE.value risk_level_definition: str = ( RiskLevelDefinition.LEVEL_ONE.value ) # Default if it's not semi or fully identifiable cluster_membership_type: Optional[str] = None hipaa_category: Optional[str] = None dhs_category: Optional[str] = None nist_category: Optional[str] = NoneClass variables
var cluster_membership_type : Optional[str]var dhs_category : Optional[str]var hipaa_category : Optional[str]var nist_category : Optional[str]var pii_type_detected : Optional[str]var risk_level : intvar risk_level_definition : str
class RiskAssessmentList (risk_assessments: List[RiskAssessment], average_risk_score: float)-
Risk Assessments and the average risk score of all list items
Expand source code
@dataclass class RiskAssessmentList: """ Risk Assessments and the average risk score of all list items """ risk_assessments: List[RiskAssessment] average_risk_score: floatClass variables
var average_risk_score : floatvar risk_assessments : List[RiskAssessment]