Step 7: register-to-BPMN transcoder tool

Adds tools/register-transcoder — a Python tool that reads a published Valsts Kase accounting-process register (.xlsx/.xlsm) and emits BPMN process skeletons. For a given sub-process it produces one userTask per register step, swimlanes from the RACI columns (placing each step in its Responsible actor's lane), sequence flows reconstructed from the register's own predecessor/successor step references, and synthesised start/end events per entry and exit step. Output is an isExecutable=false skeleton — the deterministic first pass of the transcription pipeline; refinement into a Level 4 executable package is the human/AI-assisted second pass that produced the curated FG3-1/FG3-4/FG3-5 packages. Includes a README and sample-output skeletons emitted from the FG3 register for sub-processes 3.5.2 and 3.5.3.
2026-05-19 21:38:45 +00:00
parent 37000f77f5
commit a608de41ad
4 changed files with 615 additions and 0 deletions
--- a/tools/register-transcoder/transcode.py
+++ b/tools/register-transcoder/transcode.py
@@ -0,0 +1,390 @@
+#!/usr/bin/env python3
+"""
+register-transcoder — Valsts Kase process register (.xlsx/.xlsm) -> BPMN skeleton.
+
+Part of the vk-gramatvediba UAPF workspace. Reads a published Valsts Kase
+"Grāmatvedības uzskaites procesu apraksts" function-group register and emits,
+for any sub-process in it, a BPMN process skeleton: one task per register
+step, swimlanes from the RACI columns, and sequence flows reconstructed from
+the register's own predecessor / successor step references.
+
+The output is a *skeleton*, not an executable package. It is the deterministic
+first pass of the transcription pipeline; turning a skeleton into a Level 4
+executable (explicit gateways, DMN decision extraction, resource mappings,
+package manifest) is the human/AI-assisted refinement step — see the curated
+FG3-1, FG3-4 and FG3-5 packages and docs/methodology.md.
+
+Usage:
+    transcode.py list <register.xlsx>
+    transcode.py emit <register.xlsx> <subprocess> [-o <output.bpmn>]
+
+Examples:
+    transcode.py list   fg3_process.xlsm
+    transcode.py emit   fg3_process.xlsm 3.5.2 -o 3.5.2.skeleton.bpmn
+
+Dependencies: openpyxl.
+"""
+import sys
+import re
+from xml.sax.saxutils import escape
+
+try:
+    import openpyxl
+except ImportError:
+    sys.exit("error: openpyxl is required  (pip install openpyxl)")
+
+# XML namespace URI bound to the "bpmn:" prefix on the emitted <definitions>.
+BPMN_NS = "http://www.omg.org/spec/BPMN/20100524/MODEL"
+
+# RACI actor columns, in register column order, mapped to BPMN lane ids/names.
+# Each entry is (short key, lane id, lane display name). The position of an
+# entry in this list is the offset of its column within the RACI block.
+ACTORS = [
+    ("nodarbinatais", "Lane_Nodarbinatais", "Nodarbinātais"),
+    ("iestade", "Lane_Iestade", "Iestāde"),
+    ("vpc", "Lane_VPC", "VPC (Vienotais pakalpojumu centrs)"),
+]
+
+# Header cell texts used to locate columns (substring match, case-insensitive).
+# The values are lower-case because header cells are lower-cased before they
+# are compared against these markers.
+H_PRED = "no procesa darbības soļa"
+H_NR = "nr.p.k"
+H_NAME = "process, apakšprocess"
+H_RACI = "atbildības sadalījums"
+H_DESC = "darbību apraksts"
+H_SYSTEM = "izmantotā is"
+H_DEADLINE = "izpildes termiņš"
+H_OUTPUTS = "sagatavotie dati"
+H_SUCC = "uz procesa darbības soli"
+
+
+def norm_nr(s):
+    """Normalise a step number for matching: trim, drop trailing dots."""
+    return (s or "").strip().strip(".").strip()
+
+
+def san(s):
+    """Sanitise a string into a BPMN NCName fragment."""
+    out = re.sub(r"[^A-Za-z0-9]+", "_", (s or "").strip()).strip("_")
+    return out or "x"
+
+
+def cell(ws, r, c):
+    if c is None:
+        return ""
+    v = ws.cell(row=r, column=c).value
+    return "" if v is None else str(v).strip()
+
+
+def find_sheet_and_header(wb):
+    """Locate the function-group worksheet and its header row."""
+    for ws in wb.worksheets:
+        for r in range(1, 12):
+            for c in range(1, 20):
+                v = ws.cell(row=r, column=c).value
+                if v and H_NR in str(v).lower():
+                    return ws, r
+    sys.exit("error: could not find a register sheet (no 'Nr.p.k.' header)")
+
+
+def map_columns(ws, hrow):
+    """Map logical fields to column indices using the header row."""
+    cols = {}
+    for c in range(1, ws.max_column + 1):
+        t = (ws.cell(row=hrow, column=c).value or "")
+        t = str(t).lower().strip()
+        if not t:
+            continue
+        if H_PRED in t:
+            cols["pred_fg"] = c          # predecessor FG-group column
+            cols["pred_nr"] = c + 1      # predecessor step-number sub-column
+        elif H_NR in t:
+            cols["nr"] = c
+        elif H_NAME in t:
+            cols["name"] = c
+        elif H_RACI in t:
+            cols["raci"] = c             # RACI block spans raci, +1, +2
+        elif H_DESC in t:
+            cols["desc"] = c
+        elif H_SYSTEM in t:
+            cols["system"] = c
+        elif H_DEADLINE in t:
+            cols["deadline"] = c
+        elif H_OUTPUTS in t:
+            cols["outputs"] = c
+        elif H_SUCC in t:
+            cols["succ_fg"] = c          # successor FG-group column
+            cols["succ_nr"] = c + 1      # successor step-number sub-column
+    for req in ("nr", "name", "raci"):
+        if req not in cols:
+            sys.exit(f"error: register header is missing the '{req}' column")
+    return cols
+
+
+def parse_refs(fg_cell, nr_cell):
+    """Parse a predecessor/successor cell pair into [(fg, nr_key), ...]."""
+    fgs = [x.strip() for x in str(fg_cell).splitlines() if x.strip()]
+    nrs = [x.strip() for x in str(nr_cell).splitlines() if x.strip()]
+    if not nrs:
+        return []
+    if len(fgs) == 1 and len(nrs) > 1:
+        fgs = fgs * len(nrs)
+    refs = []
+    for i, nr in enumerate(nrs):
+        fg = fgs[i] if i < len(fgs) else (fgs[0] if fgs else "")
+        key = norm_nr(nr)
+        if key:
+            refs.append((fg.upper(), key))
+    return refs
+
+
+def parse_register(path):
+    """Return (steps, subprocesses). Each step is a dict; subprocesses maps
+    a sub-process key -> its register name."""
+    wb = openpyxl.load_workbook(path, data_only=True)
+    ws, hrow = find_sheet_and_header(wb)
+    cols = map_columns(ws, hrow)
+    own_fg = re.sub(r"[^A-Za-z0-9]", "", ws.title).upper()  # e.g. FG3
+
+    steps = []
+    subprocesses = {}
+    current_sub = None
+    for r in range(hrow + 2, ws.max_row + 1):
+        nr = cell(ws, r, cols["nr"])
+        name = cell(ws, r, cols["name"])
+        if not nr or not name:
+            continue
+        raci = [cell(ws, r, cols["raci"] + i) for i in range(3)]
+        desc = cell(ws, r, cols.get("desc"))
+        is_step = bool(desc) or any(raci)
+        if not is_step:
+            # section / sub-process header row
+            current_sub = norm_nr(nr)
+            subprocesses[current_sub] = name
+            continue
+        steps.append({
+            "nr": nr, "key": norm_nr(nr), "name": name,
+            "sub": current_sub, "raci": raci, "desc": desc,
+            "system": cell(ws, r, cols.get("system")),
+            "deadline": cell(ws, r, cols.get("deadline")),
+            "outputs": cell(ws, r, cols.get("outputs")),
+            "pred": parse_refs(cell(ws, r, cols.get("pred_fg")),
+                               cell(ws, r, cols.get("pred_nr"))),
+            "succ": parse_refs(cell(ws, r, cols.get("succ_fg")),
+                               cell(ws, r, cols.get("succ_nr"))),
+            "own_fg": own_fg,
+        })
+    return steps, subprocesses
+
+
+def primary_lane(raci):
+    """Pick the swimlane for a step: the actor that is Responsible ('R')."""
+    for i, v in enumerate(raci):
+        if "R" in v.upper():
+            return ACTORS[i]
+    for i, v in enumerate(raci):
+        if "A" in v.upper():
+            return ACTORS[i]
+    for i, v in enumerate(raci):
+        if v:
+            return ACTORS[i]
+    return ACTORS[2]  # default: VPC
+
+
+def build_flows(group):
+    """Reconstruct in-group sequence flows from predecessor/successor links.
+    Returns a set of (src_key, dst_key)."""
+    keys = {s["key"] for s in group}
+    edges = set()
+    for s in group:
+        for fg, nr in s["pred"]:
+            if nr in keys and nr != s["key"]:
+                edges.add((nr, s["key"]))
+        for fg, nr in s["succ"]:
+            if nr in keys and nr != s["key"]:
+                edges.add((s["key"], nr))
+    return edges
+
+
+def doc_text(s):
+    """Assemble the <documentation> body for a step's task."""
+    parts = []
+    raci_bits = [f"{ACTORS[i][2].split(' ')[0]}={s['raci'][i]}"
+                 for i in range(3) if s["raci"][i]]
+    parts.append(f"Nr.p.k.: {s['nr']}  |  RACI: " + "; ".join(raci_bits))
+    if s["desc"]:
+        parts.append(s["desc"])
+    meta = []
+    if s["system"]:
+        meta.append("Sistēma: " + s["system"].replace("\n", " "))
+    if s["deadline"]:
+        meta.append("Izpildes termiņš: " + s["deadline"].replace("\n", " "))
+    if s["outputs"]:
+        meta.append("Sagatavotie dati: " + s["outputs"].replace("\n", " "))
+    if meta:
+        parts.append("  |  ".join(meta))
+    ext_p = [f"{fg}/{nr}" for fg, nr in s["pred"]
+             if nr not in s["_groupkeys"]]
+    ext_s = [f"{fg}/{nr}" for fg, nr in s["succ"]
+             if nr not in s["_groupkeys"]]
+    if ext_p:
+        parts.append("Ārējais priekštecis: " + ", ".join(ext_p))
+    if ext_s:
+        parts.append("Ārējais pēctecis: " + ", ".join(ext_s))
+    return "\n".join(parts)
+
+
+def emit_bpmn(steps, subprocesses, sub):
+    group = [s for s in steps if s["sub"] == sub]
+    if not group:
+        avail = ", ".join(sorted(subprocesses)) or "(none)"
+        sys.exit(f"error: no steps for sub-process '{sub}'. Available: {avail}")
+    gkeys = {s["key"] for s in group}
+    for s in group:
+        s["_groupkeys"] = gkeys
+
+    edges = build_flows(group)
+    indeg = {s["key"]: 0 for s in group}
+    outdeg = {s["key"]: 0 for s in group}
+    for a, b in edges:
+        outdeg[a] += 1
+        indeg[b] += 1
+    entries = [s for s in group if indeg[s["key"]] == 0] or [group[0]]
+    exits = [s for s in group if outdeg[s["key"]] == 0] or [group[-1]]
+
+    tid = {s["key"]: "Task_" + san(s["nr"]) for s in group}
+    lanes_used = {}
+    for s in group:
+        lane = primary_lane(s["raci"])
+        s["_lane"] = lane[1]
+        lanes_used.setdefault(lane[1], (lane[1], lane[2]))
+
+    name = subprocesses.get(sub, sub)
+    proc_id = "Process_" + san(sub)
+    L = []
+    L.append('<?xml version="1.0" encoding="UTF-8"?>')
+    L.append('<bpmn:definitions '
+             'xmlns:bpmn="%s" id="Defs_%s" '
+             'targetNamespace="https://uapf.dev/vk-gramatvediba/transcoded">'
+             % (BPMN_NS, san(sub)))
+    L.append('  <bpmn:process id="%s" name="%s" isExecutable="false">'
+             % (proc_id, escape(name)))
+
+    # --- lanes ---
+    node_lane = {}
+    for s in group:
+        node_lane[tid[s["key"]]] = s["_lane"]
+    start_ids = ["Start_%d" % (i + 1) for i in range(len(entries))]
+    end_ids = ["End_%d" % (i + 1) for i in range(len(exits))]
+    L.append('    <bpmn:laneSet id="LaneSet_%s">' % san(sub))
+    # start/end events go in the lane of the step they touch
+    extra = {}
+    for sid, st in zip(start_ids, entries):
+        extra.setdefault(st["_lane"], []).append(sid)
+    for eid, st in zip(end_ids, exits):
+        extra.setdefault(st["_lane"], []).append(eid)
+    for lid, lname in lanes_used.values():
+        L.append('      <bpmn:lane id="%s" name="%s">' % (lid, escape(lname)))
+        for s in group:
+            if s["_lane"] == lid:
+                L.append('        <bpmn:flowNodeRef>%s</bpmn:flowNodeRef>'
+                         % tid[s["key"]])
+        for nid in extra.get(lid, []):
+            L.append('        <bpmn:flowNodeRef>%s</bpmn:flowNodeRef>' % nid)
+        L.append('      </bpmn:lane>')
+    L.append('    </bpmn:laneSet>')
+
+    # --- collect flows: start->entry, edges, exit->end ---
+    flows = []
+    fc = 0
+    incoming = {}
+    outgoing = {}
+
+    def add_flow(src, dst):
+        nonlocal fc
+        fc += 1
+        fid = "Flow_%d" % fc
+        flows.append((fid, src, dst))
+        outgoing.setdefault(src, []).append(fid)
+        incoming.setdefault(dst, []).append(fid)
+        return fid
+
+    for sid, st in zip(start_ids, entries):
+        add_flow(sid, tid[st["key"]])
+    for a, b in sorted(edges):
+        add_flow(tid[a], tid[b])
+    for eid, st in zip(end_ids, exits):
+        add_flow(tid[st["key"]], eid)
+
+    # --- events + tasks ---
+    for sid, st in zip(start_ids, entries):
+        L.append('    <bpmn:startEvent id="%s" name="Ieeja: %s">'
+                 % (sid, escape(st["nr"])))
+        for f in outgoing.get(sid, []):
+            L.append('      <bpmn:outgoing>%s</bpmn:outgoing>' % f)
+        L.append('    </bpmn:startEvent>')
+
+    for s in group:
+        t = tid[s["key"]]
+        L.append('    <bpmn:userTask id="%s" name="%s">'
+                 % (t, escape(s["name"].replace("\n", " "))))
+        L.append('      <bpmn:documentation>%s</bpmn:documentation>'
+                 % escape(doc_text(s)))
+        for f in incoming.get(t, []):
+            L.append('      <bpmn:incoming>%s</bpmn:incoming>' % f)
+        for f in outgoing.get(t, []):
+            L.append('      <bpmn:outgoing>%s</bpmn:outgoing>' % f)
+        L.append('    </bpmn:userTask>')
+
+    for eid, st in zip(end_ids, exits):
+        L.append('    <bpmn:endEvent id="%s" name="Izeja: %s">'
+                 % (eid, escape(st["nr"])))
+        for f in incoming.get(eid, []):
+            L.append('      <bpmn:incoming>%s</bpmn:incoming>' % f)
+        L.append('    </bpmn:endEvent>')
+
+    for fid, src, dst in flows:
+        L.append('    <bpmn:sequenceFlow id="%s" sourceRef="%s" '
+                 'targetRef="%s"/>' % (fid, src, dst))
+
+    L.append('  </bpmn:process>')
+    L.append('</bpmn:definitions>')
+    return "\n".join(L) + "\n"
+
+
+def cmd_list(path):
+    steps, subs = parse_register(path)
+    counts = {}
+    for s in steps:
+        counts[s["sub"]] = counts.get(s["sub"], 0) + 1
+    print(f"register: {path}")
+    print(f"{len(steps)} steps in {len(counts)} sub-process(es) with steps:\n")
+    for sub in sorted(counts):
+        print(f"  {sub:<10} {counts[sub]:>3} step(s)   {subs.get(sub, '')}")
+    print("\nemit a sub-process:  transcode.py emit <register> <subprocess>")
+
+
+def cmd_emit(path, sub, out):
+    steps, subs = parse_register(path)
+    xml = emit_bpmn(steps, subs, sub)
+    if out:
+        with open(out, "w", encoding="utf-8") as fh:
+            fh.write(xml)
+        n = len([s for s in steps if s["sub"] == sub])
+        print(f"wrote {out}  ({n} step(s), sub-process {sub} — {subs.get(sub,'')})")
+    else:
+        sys.stdout.write(xml)
+
+
+def main(argv):
+    if len(argv) < 3 or argv[1] not in ("list", "emit"):
+        sys.exit(__doc__.strip())
+    if argv[1] == "list":
+        cmd_list(argv[2])
+    else:
+        if len(argv) < 4:
+            sys.exit("usage: transcode.py emit <register> <subprocess> [-o out]")
+        out = None
+        if "-o" in argv:
+            out = argv[argv.index("-o") + 1]
+        cmd_emit(argv[2], argv[3], out)
+
+
+# Run the command-line interface only when executed as a script, not when
+# the module is imported.
+if __name__ == "__main__":
+    main(sys.argv)