You've already forked dokumenta-semantiska-analize
Import UAPF package
Compare commits
3 Commits
v3.1.0-bpm
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 7e9cb63a4b | |||
| 9b3790c1fa | |||
| e97b9d7d40 |
12
README.md
12
README.md
@@ -98,3 +98,15 @@ Validates against UAPF v2.4.0 schemas at
|
||||
```bash
|
||||
python tools/uapf-cli/uapf.py validate /path/to/dokumenta-semantiska-analize
|
||||
```
|
||||
|
||||
## v3.2.0 (UAPF v2.5.0 alignment)
|
||||
|
||||
Tests are now **embedded in each algorithm card** under a top-level `tests:` array (minimum 2 entries per card). The old sidecar location `tests/algorithms/<card-id>.test.yaml` is **removed** per UAPF v2.5.0 — that location no longer applies to algorithm cards.
|
||||
|
||||
Embedded tests for this package:
|
||||
- `algo.semantic_document_analysis.pii_redactor` — 3 cases (Latvian personas kods inline, plain text with no PII, financial figures + IBAN)
|
||||
- `algo.semantic_document_analysis.vdvc_semantic_extractor` — 2 cases (regulatory complaint, non-regulatory thank-you), both with `ai_confidence_score` tolerance bands appropriate for a stochastic LLM extractor
|
||||
- `algo.semantic_document_analysis.completion_event_emitter` — 2 cases (success completion, failure completion)
|
||||
|
||||
The Algorithm Card viewer (UAPF v2.5.0 chapter 13.16, ProcessGit Preview tab) consumes these embedded tests as its primary interaction surface — sample browser for `external` cards, regex/FEEL/source-display for `inline` cards.
|
||||
|
||||
|
||||
@@ -1,17 +1,10 @@
|
||||
kind: uapf.algorithm.card
|
||||
|
||||
id: algo.semantic_document_analysis.completion_event_emitter
|
||||
version: "1.0.0"
|
||||
name: "Process completion event emitter"
|
||||
intent: >
|
||||
Publishes a CloudEvents 1.0-conformant event marking the completion
|
||||
of one semantic analysis cycle, with the DMN-decided fields
|
||||
(personal data risk, processing route, redaction level, human
|
||||
validation status) attached. Personal data is NEVER included in
|
||||
the emitted payload — only the deterministic classification fields.
|
||||
|
||||
version: 1.0.0
|
||||
name: Process completion event emitter
|
||||
intent: |
|
||||
Publishes a CloudEvents 1.0-conformant event marking the completion of one semantic analysis cycle, with the DMN-decided fields (personal data risk, processing route, redaction level, human validation status) attached. Personal data is NEVER included in the emitted payload — only the deterministic classification fields.
|
||||
algorithm_kind: emitter
|
||||
|
||||
io:
|
||||
inputs:
|
||||
- id: event_type
|
||||
@@ -23,42 +16,55 @@ io:
|
||||
outputs:
|
||||
- id: published
|
||||
type: boolean
|
||||
|
||||
implementation:
|
||||
type: external
|
||||
medium: mcp_tool
|
||||
uri: "uapf-ip://capability/event.emit@1"
|
||||
hash: "sha256:0000000000000000000000000000000000000000000000000000000000000000"
|
||||
uri: uapf-ip://capability/event.emit@1
|
||||
hash: sha256:0000000000000000000000000000000000000000000000000000000000000000
|
||||
runtime:
|
||||
capability: "event.emit@1"
|
||||
cloud_events_spec: "1.0"
|
||||
|
||||
capability: event.emit@1
|
||||
cloud_events_spec: '1.0'
|
||||
determinism: deterministic
|
||||
side_effects: writes_state
|
||||
|
||||
confidence:
|
||||
type: none
|
||||
|
||||
complexity:
|
||||
typical_latency_ms: 25
|
||||
max_latency_ms: 1000
|
||||
|
||||
failure_mode: "throw — process must complete reliably or fail loudly."
|
||||
|
||||
failure_mode: throw — process must complete reliably or fail loudly.
|
||||
reference:
|
||||
standard: "CloudEvents 1.0"
|
||||
url: "https://github.com/cloudevents/spec/blob/v1.0/spec.md"
|
||||
|
||||
standard: CloudEvents 1.0
|
||||
url: https://github.com/cloudevents/spec/blob/v1.0/spec.md
|
||||
owners:
|
||||
- type: team
|
||||
- type: team
|
||||
id: uapf-stewards
|
||||
contact: stewards@uapf.dev
|
||||
|
||||
lifecycle:
|
||||
status: draft
|
||||
since: "2026-05-20"
|
||||
|
||||
since: '2026-05-20'
|
||||
audit:
|
||||
log_inputs: full
|
||||
log_outputs: full
|
||||
retention: "1y"
|
||||
retention: 1y
|
||||
tests:
|
||||
- name: Successful analysis completion
|
||||
description: Standard happy-path completion event with full payload.
|
||||
inputs:
|
||||
event_type: dev.dokumenta.semantic_analysis.completed
|
||||
payload:
|
||||
document_id: doc-2026-05-21-001
|
||||
outcome: ok
|
||||
confidence: 0.87
|
||||
expected_outputs:
|
||||
published: true
|
||||
- name: Analysis failure completion
|
||||
description: Failure-path completion event still emits successfully (the emitter
|
||||
does not gate on payload contents).
|
||||
inputs:
|
||||
event_type: dev.dokumenta.semantic_analysis.failed
|
||||
payload:
|
||||
document_id: doc-2026-05-21-002
|
||||
outcome: extraction_failed
|
||||
reason: low_confidence
|
||||
expected_outputs:
|
||||
published: true
|
||||
|
||||
@@ -1,17 +1,10 @@
|
||||
kind: uapf.algorithm.card
|
||||
|
||||
id: algo.semantic_document_analysis.pii_redactor
|
||||
version: "1.0.0"
|
||||
name: "PII detector and redactor"
|
||||
intent: >
|
||||
Detects personally identifiable information in free-text documents
|
||||
(Latvian personas kods, IBAN, phone numbers, e-mail addresses,
|
||||
names) and returns the source text with PII masked plus structured
|
||||
regex-hit signals used by the downstream DMN decision
|
||||
assess-personal-data-risk.
|
||||
|
||||
version: 1.0.0
|
||||
name: PII detector and redactor
|
||||
intent: |
|
||||
Detects personally identifiable information in free-text documents (Latvian personas kods, IBAN, phone numbers, e-mail addresses, names) and returns the source text with PII masked plus structured regex-hit signals used by the downstream DMN decision assess-personal-data-risk.
|
||||
algorithm_kind: redactor
|
||||
|
||||
io:
|
||||
inputs:
|
||||
- id: content
|
||||
@@ -19,14 +12,14 @@ io:
|
||||
cardinality: single
|
||||
constraints:
|
||||
maxLength: 200000
|
||||
documentation: "Raw document text submitted for semantic analysis."
|
||||
documentation: Raw document text submitted for semantic analysis.
|
||||
outputs:
|
||||
- id: redacted_content
|
||||
type: string
|
||||
documentation: "Source text with PII masked by category tokens."
|
||||
documentation: Source text with PII masked by category tokens.
|
||||
- id: detected_entity_types
|
||||
type: array
|
||||
documentation: "PII category names only — never values."
|
||||
documentation: PII category names only — never values.
|
||||
- id: personas_koda_present
|
||||
type: boolean
|
||||
- id: financial_data_present
|
||||
@@ -35,53 +28,90 @@ io:
|
||||
type: boolean
|
||||
- id: pii_category_count
|
||||
type: integer
|
||||
constraints: { minimum: 0 }
|
||||
|
||||
constraints:
|
||||
minimum: 0
|
||||
implementation:
|
||||
type: external
|
||||
medium: mcp_tool
|
||||
uri: "uapf-ip://capability/ai.redact@1"
|
||||
hash: "sha256:0000000000000000000000000000000000000000000000000000000000000000"
|
||||
uri: uapf-ip://capability/ai.redact@1
|
||||
hash: sha256:0000000000000000000000000000000000000000000000000000000000000000
|
||||
runtime:
|
||||
capability: "ai.redact@1"
|
||||
note: "Host-fulfilled UAPF-IP capability. Hash is a placeholder until the runtime publishes the implementation hash of its ai.redact handler."
|
||||
|
||||
capability: ai.redact@1
|
||||
note: Host-fulfilled UAPF-IP capability. Hash is a placeholder until the runtime
|
||||
publishes the implementation hash of its ai.redact handler.
|
||||
determinism: deterministic
|
||||
side_effects: pure
|
||||
|
||||
complexity:
|
||||
typical_latency_ms: 250
|
||||
max_latency_ms: 10000
|
||||
|
||||
failure_mode: "throw — refuse processing if redactor unavailable; PII risk dominates."
|
||||
|
||||
failure_mode: throw — refuse processing if redactor unavailable; PII risk dominates.
|
||||
limitations:
|
||||
- "Latviešu valodas personu vārdi atpazīstami ~92% gadījumu"
|
||||
- "Pieņem, ka teksts jau ir digitāls — OCR nav iekļauta"
|
||||
|
||||
- Latviešu valodas personu vārdi atpazīstami ~92% gadījumu
|
||||
- Pieņem, ka teksts jau ir digitāls — OCR nav iekļauta
|
||||
reference:
|
||||
legal: "GDPR 2016/679 5. pants (datu minimizēšana); Fizisko personu datu apstrādes likums."
|
||||
standard: "NIST SP 800-188 — De-Identification of Personal Information."
|
||||
|
||||
legal: GDPR 2016/679 5. pants (datu minimizēšana); Fizisko personu datu apstrādes
|
||||
likums.
|
||||
standard: NIST SP 800-188 — De-Identification of Personal Information.
|
||||
owners:
|
||||
- type: role
|
||||
- type: role
|
||||
id: data_protection_officer
|
||||
contact: stewards@uapf.dev
|
||||
|
||||
lifecycle:
|
||||
status: draft
|
||||
since: "2026-05-20"
|
||||
|
||||
since: '2026-05-20'
|
||||
audit:
|
||||
log_inputs: redacted
|
||||
log_outputs: full
|
||||
retention: "7y"
|
||||
|
||||
retention: 7y
|
||||
privacy:
|
||||
processesPII: true
|
||||
technique: pseudonymization
|
||||
reidentificationRisk: low
|
||||
|
||||
risk:
|
||||
aiActRiskClass: limited
|
||||
humanOversight: advisory
|
||||
tests:
|
||||
- name: Latvian personas kods inline in text
|
||||
description: Standard 11-digit Latvian personal identity code (NNNNNN-NNNNN)
|
||||
should be detected and redacted.
|
||||
inputs:
|
||||
content: 'Lūgums izskatīt iesniegumu. Iesniedzējs: Jānis Bērziņš, personas kods:
|
||||
010101-12345. Adrese: Brīvības iela 1, Rīga.'
|
||||
expected_outputs:
|
||||
redacted_content: 'Lūgums izskatīt iesniegumu. Iesniedzējs: [NAME], personas kods:
|
||||
[REDACTED]. Adrese: [ADDRESS].'
|
||||
detected_entity_types:
|
||||
- PERSONAS_KODS
|
||||
- PERSON
|
||||
- ADDRESS
|
||||
personas_koda_present: true
|
||||
financial_data_present: false
|
||||
contact_data_present: true
|
||||
pii_category_count: 3
|
||||
- name: Plain administrative text with no PII
|
||||
description: Generic administrative paragraph; nothing to redact. Verifies the redactor
|
||||
doesn't false-positive on plain text.
|
||||
inputs:
|
||||
content: Iesniegums tiek izskatīts atbilstoši normatīvajiem aktiem. Lēmums tiks
|
||||
paziņots noteiktajā kārtībā.
|
||||
expected_outputs:
|
||||
redacted_content: Iesniegums tiek izskatīts atbilstoši normatīvajiem aktiem. Lēmums
|
||||
tiks paziņots noteiktajā kārtībā.
|
||||
detected_entity_types: []
|
||||
personas_koda_present: false
|
||||
financial_data_present: false
|
||||
contact_data_present: false
|
||||
pii_category_count: 0
|
||||
- name: Financial figures and account numbers
|
||||
description: EUR amounts and IBAN — both detected as financial PII; no personas_kods.
|
||||
inputs:
|
||||
content: Maksājums EUR 1250.00 pārskaitīts uz kontu LV80BANK0000435195001.
|
||||
expected_outputs:
|
||||
redacted_content: Maksājums EUR [AMOUNT] pārskaitīts uz kontu [IBAN].
|
||||
detected_entity_types:
|
||||
- AMOUNT
|
||||
- IBAN
|
||||
personas_koda_present: false
|
||||
financial_data_present: true
|
||||
contact_data_present: false
|
||||
pii_category_count: 2
|
||||
|
||||
@@ -1,18 +1,10 @@
|
||||
kind: uapf.algorithm.card
|
||||
|
||||
id: algo.semantic_document_analysis.vdvc_semantic_extractor
|
||||
version: "1.0.0"
|
||||
name: "VDVC semantic metadata extractor"
|
||||
intent: >
|
||||
Extracts a VDVC v1.1-conformant structured semantic summary from
|
||||
the redacted document text — primary topic, keywords,
|
||||
classification, summary, sensitivity signals. Output validates
|
||||
against resources/schemas/vdvc-semantic-summary.schema.json. This
|
||||
is the sole model-inference step in the process; everything else
|
||||
in the package is deterministic.
|
||||
|
||||
version: 1.0.0
|
||||
name: VDVC semantic metadata extractor
|
||||
intent: |
|
||||
Extracts a VDVC v1.1-conformant structured semantic summary from the redacted document text — primary topic, keywords, classification, summary, sensitivity signals. Output validates against resources/schemas/vdvc-semantic-summary.schema.json. This is the sole model-inference step in the process; everything else in the package is deterministic.
|
||||
algorithm_kind: extractor
|
||||
|
||||
io:
|
||||
inputs:
|
||||
- id: redacted_content
|
||||
@@ -20,69 +12,108 @@ io:
|
||||
cardinality: single
|
||||
constraints:
|
||||
maxLength: 200000
|
||||
documentation: "Output of the upstream PII redactor."
|
||||
documentation: Output of the upstream PII redactor.
|
||||
- id: schema_ref
|
||||
type: string
|
||||
documentation: "Path to the JSON Schema the output must validate against."
|
||||
documentation: Path to the JSON Schema the output must validate against.
|
||||
outputs:
|
||||
- id: semantic_summary
|
||||
type: object
|
||||
schema: "../resources/schemas/vdvc-semantic-summary.schema.json"
|
||||
schema: ../resources/schemas/vdvc-semantic-summary.schema.json
|
||||
- id: sensitivity_control
|
||||
type: object
|
||||
- id: ai_confidence_score
|
||||
type: probability
|
||||
- id: output_pii_error_count
|
||||
type: integer
|
||||
constraints: { minimum: 0 }
|
||||
|
||||
constraints:
|
||||
minimum: 0
|
||||
implementation:
|
||||
type: external
|
||||
medium: llm_prompt
|
||||
uri: "uapf-ip://capability/ai.extract@1"
|
||||
hash: "sha256:0000000000000000000000000000000000000000000000000000000000000000"
|
||||
uri: uapf-ip://capability/ai.extract@1
|
||||
hash: sha256:0000000000000000000000000000000000000000000000000000000000000000
|
||||
runtime:
|
||||
capability: "ai.extract@1"
|
||||
note: "Host-fulfilled UAPF-IP capability. Specific model identity and prompt hash are runtime concerns of the host; the Card declares the contract, not the implementation choice."
|
||||
|
||||
capability: ai.extract@1
|
||||
note: Host-fulfilled UAPF-IP capability. Specific model identity and prompt hash
|
||||
are runtime concerns of the host; the Card declares the contract, not the implementation
|
||||
choice.
|
||||
determinism: stochastic
|
||||
side_effects: external_call
|
||||
|
||||
confidence:
|
||||
type: probability
|
||||
threshold: 0.70
|
||||
below_threshold: "route-to:human.legal_reviewer (enforced by DMN human-validation-gate)"
|
||||
|
||||
threshold: 0.7
|
||||
below_threshold: route-to:human.legal_reviewer (enforced by DMN human-validation-gate)
|
||||
complexity:
|
||||
typical_latency_ms: 8000
|
||||
max_latency_ms: 60000
|
||||
|
||||
failure_mode: "default:null + flag — DMN human-validation-gate routes low-confidence outputs to PENDING_REVIEW."
|
||||
|
||||
failure_mode: default:null + flag — DMN human-validation-gate routes low-confidence
|
||||
outputs to PENDING_REVIEW.
|
||||
limitations:
|
||||
- "Garie dokumenti (>50 000 znaki) tiek apgriezti — pirmie 50K + pēdējie 5K"
|
||||
- "Nav juridisks vērtējums — tikai semantiska klasifikācija"
|
||||
- "Latviešu valodas juridiskā retorika var samazināt recall"
|
||||
|
||||
- Garie dokumenti (>50 000 znaki) tiek apgriezti — pirmie 50K + pēdējie 5K
|
||||
- Nav juridisks vērtējums — tikai semantiska klasifikācija
|
||||
- Latviešu valodas juridiskā retorika var samazināt recall
|
||||
reference:
|
||||
legal: "EU AI Act 2024/1689, Pielikums III (augstā riska MI sistēmas), 13. pants (caurspīdība)."
|
||||
url: "https://eur-lex.europa.eu/eli/reg/2024/1689/oj"
|
||||
|
||||
legal: EU AI Act 2024/1689, Pielikums III (augstā riska MI sistēmas), 13. pants
|
||||
(caurspīdība).
|
||||
url: https://eur-lex.europa.eu/eli/reg/2024/1689/oj
|
||||
owners:
|
||||
- type: team
|
||||
- type: team
|
||||
id: uapf-stewards
|
||||
contact: stewards@uapf.dev
|
||||
|
||||
lifecycle:
|
||||
status: draft
|
||||
since: "2026-05-20"
|
||||
|
||||
since: '2026-05-20'
|
||||
audit:
|
||||
log_inputs: redacted
|
||||
log_outputs: full
|
||||
retention: "7y"
|
||||
|
||||
retention: 7y
|
||||
risk:
|
||||
aiActRiskClass: high
|
||||
humanOversight: mandatory
|
||||
transparencyTier: tier-3-full
|
||||
tests:
|
||||
- name: Regulatory iesniegums about administrative decision
|
||||
description: Typical Latvian administrative complaint with redacted PII. The extractor
|
||||
should identify topic + risk + applicable regulation.
|
||||
inputs:
|
||||
redacted_content: Iesniedzējs [NAME] iesniedza sūdzību par būvvaldes lēmumu Nr.
|
||||
12345 atteikt būvatļauju adresē [ADDRESS]. Tiek lūgts pārskatīt lēmumu.
|
||||
schema_ref: schemas/iesniegums/v1
|
||||
expected_outputs:
|
||||
semantic_summary:
|
||||
topic: construction-permit-appeal
|
||||
subject_area: administrative-law
|
||||
applicable_regulations:
|
||||
- BL
|
||||
- APL
|
||||
language: lv
|
||||
sensitivity_control:
|
||||
contains_decision_reference: true
|
||||
external_communication_recommended: false
|
||||
ai_confidence_score: 0.87
|
||||
output_pii_error_count: 0
|
||||
tolerance:
|
||||
ai_confidence_score: 0.1
|
||||
output_pii_error_count: 0
|
||||
- name: Non-regulatory thank-you note
|
||||
description: Out-of-domain input. Extractor should yield a low-confidence summary
|
||||
and a sensitivity flag that no decision is referenced.
|
||||
inputs:
|
||||
redacted_content: Paldies par jūsu pakalpojumu! Bija ļoti patīkami sadarboties
|
||||
ar [NAME] no jūsu komandas.
|
||||
schema_ref: schemas/iesniegums/v1
|
||||
expected_outputs:
|
||||
semantic_summary:
|
||||
topic: non-actionable-correspondence
|
||||
subject_area: feedback
|
||||
applicable_regulations: []
|
||||
language: lv
|
||||
sensitivity_control:
|
||||
contains_decision_reference: false
|
||||
external_communication_recommended: false
|
||||
ai_confidence_score: 0.62
|
||||
output_pii_error_count: 0
|
||||
tolerance:
|
||||
ai_confidence_score: 0.15
|
||||
output_pii_error_count: 0
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
"name": "Semantic Document Analysis",
|
||||
"description": "Level-4 UAPF process for semantic analysis of free-text documents.\n\nThree BPMN service tasks invoke the UAPF-IP capabilities ai.redact@1,\nai.extract@1 and event.emit@1. Three DMN decision tables encode the\ndeterministic algorithm the host previously hid inside application\ncode: assess-personal-data-risk maps PII regex signals to a risk\nlevel; gdpr-processing-route selects CENTRAL vs LOCAL processing,\nanonymisation and redaction level; human-validation-gate applies the\nconfidence thresholds that decide REJECTED / PENDING_REVIEW /\nAPPROVED_AUTO.\n\nOnly the semantic extraction is a model step. Risk classification,\nGDPR routing and the validation gate are explicit ranked rules in\nversioned DMN \u2014 inspectable, auditable, portable. Extraction output\nvalidates against the VDVC v1.1 semantic-summary JSON Schema.\n\nv3.1.0: aligned with UAPF v2.4.0 \u2014 Algorithm Card references move\nfrom resource targets to the BPMN service tasks themselves (via\nuapf24:algorithmCardRef attribute). Each card's io block is also\ndenormalised into a <bpmn:ioSpecification> on the task so inputs\nand outputs render as visible data objects on the diagram. The\ncards themselves and the DMN decisions are unchanged from v3.0.0.\n",
|
||||
"level": 4,
|
||||
"version": "3.1.0",
|
||||
"version": "3.2.0",
|
||||
"requires_capabilities": [
|
||||
"ai.redact@1+",
|
||||
"ai.extract@1+",
|
||||
|
||||
Reference in New Issue
Block a user