# UAPF algorithm card for a PII detection and redaction step.
kind: uapf.algorithm.card

# Identity of the card: dotted identifier, version string and display name.
id: algo.semantic_document_analysis.pii_redactor
version: "1.0.0"
name: "PII detector and redactor"
# Folded scalar (`>`): the indented lines are joined with single spaces into
# one paragraph. Do not place comments inside it — they would become part of
# the text.
intent: >
  Detects personally identifiable information in free-text documents
  (Latvian personas kods, IBAN, phone numbers, e-mail addresses,
  names) and returns the source text with PII masked plus structured
  regex-hit signals used by the downstream DMN decision
  assess-personal-data-risk.

# NOTE(review): the set of accepted algorithm_kind values is defined by the
# card schema, which is not visible here.
algorithm_kind: redactor

# Declared interface of the algorithm: one text input and six outputs.
io:
  inputs:
    # The document text to scan; maxLength caps its size at 200000.
    - id: content
      type: string
      cardinality: single
      constraints:
        maxLength: 200000
      documentation: "Raw document text submitted for semantic analysis."
  outputs:
    # The input text with detected PII replaced by category tokens.
    - id: redacted_content
      type: string
      documentation: "Source text with PII masked by category tokens."
    # Category labels only; raw PII values must never appear in this list.
    - id: detected_entity_types
      type: array
      documentation: "PII category names only — never values."
    # Boolean flags without a documentation field.
    # NOTE(review): presumably these are the "regex-hit signals" that the
    # intent says feed the downstream DMN decision — confirm, and consider
    # adding documentation fields like the outputs above.
    - id: personas_koda_present
      type: boolean
    - id: financial_data_present
      type: boolean
    - id: contact_data_present
      type: boolean
    # Non-negative integer.
    # NOTE(review): presumably the number of entries in
    # detected_entity_types — confirm against the implementation.
    - id: pii_category_count
      type: integer
      constraints:
        minimum: 0

# The implementation is not embedded in the card: it is referenced externally
# as an MCP tool, identified by a UAPF-IP capability URI.
implementation:
  type: external
  medium: mcp_tool
  uri: "uapf-ip://capability/ai.redact@1"
  # All-zero SHA-256 placeholder, not a real digest. Per the runtime note
  # below it must be replaced once the runtime publishes the implementation
  # hash of its ai.redact handler.
  # TODO: replace before the card leaves draft status.
  hash: "sha256:0000000000000000000000000000000000000000000000000000000000000000"
  runtime:
    # Same capability name and version as in the uri above; keep them in sync.
    capability: "ai.redact@1"
    note: "Host-fulfilled UAPF-IP capability. Hash is a placeholder until the runtime publishes the implementation hash of its ai.redact handler."

# NOTE(review): the implementation is a host-fulfilled capability and the
# limitations list a ~92% name-recognition rate — confirm the handler really
# returns identical output for identical input before relying on this claim.
determinism: deterministic
# Declares the algorithm as free of side effects.
side_effects: pure

# Latency expectations in milliseconds (per the `_ms` key suffix).
complexity:
  typical_latency_ms: 250
  max_latency_ms: 10000

# Fail-closed policy: when the redactor is unavailable, processing is refused
# rather than letting unredacted text continue downstream.
failure_mode: "throw — refuse processing if redactor unavailable; PII risk dominates."

# Known limitations (values are in Latvian). English glosses:
#   1. Latvian-language personal names are recognised in ~92% of cases.
#   2. Assumes the text is already digital — OCR is not included.
limitations:
  - "Latviešu valodas personu vārdi atpazīstami ~92% gadījumu"
  - "Pieņem, ka teksts jau ir digitāls — OCR nav iekļauta"

# Legal and standards basis for the card.
reference:
  # English gloss: GDPR 2016/679 Article 5 (data minimisation); the Latvian
  # Personal Data Processing Law.
  legal: "GDPR 2016/679 5. pants (datu minimizēšana); Fizisko personu datu apstrādes likums."
  standard: "NIST SP 800-188 — De-Identification of Personal Information."

# Ownership is assigned to a role rather than a named person, with a contact
# mailbox for that role.
owners:
  - type: role
    id: data_protection_officer
    contact: stewards@uapf.dev

# Publication state of the card.
lifecycle:
  status: draft
  # Quoted so the value stays a string instead of being parsed as a date.
  # NOTE(review): whether `since` marks card creation or the start of the
  # current status is not shown here — confirm against the schema.
  since: "2026-05-20"

# Audit logging policy for invocations of this algorithm.
audit:
  # Inputs carry the raw document text, so they are logged in redacted form.
  log_inputs: redacted
  # NOTE(review): outputs include redacted_content, and the limitations state
  # that Latvian names are recognised only ~92% of the time; logging outputs
  # in full may therefore retain residual PII for the whole retention
  # period — confirm this is acceptable.
  log_outputs: full
  # Presumably seven years; the duration format is defined by the schema.
  retention: "7y"

# NOTE(review): keys in `privacy` and `risk` are camelCase while the rest of
# the card uses snake_case — presumably dictated by the schema; confirm.
privacy:
  processesPII: true
  # NOTE(review): the redacted_content output is described as masking PII
  # with category tokens, which is closer to masking/redaction than to
  # pseudonymisation in the GDPR Art. 4(5) sense — confirm the intended
  # value against the techniques the schema allows.
  technique: pseudonymization
  reidentificationRisk: low

# Risk classification fields; the key name suggests the EU AI Act.
# NOTE(review): allowed values for both keys come from the schema, which is
# not visible here.
risk:
  aiActRiskClass: limited
  humanOversight: advisory