Step 7: register-to-BPMN transcoder tool
Adds tools/register-transcoder — a Python tool that reads a published Valsts Kase accounting-process register (.xlsx/.xlsm) and emits BPMN process skeletons. For a given sub-process it produces one userTask per register step, swimlanes from the RACI columns (placing each step in its Responsible actor's lane), sequence flows reconstructed from the register's own predecessor/successor step references, and synthesised start/end events per entry and exit step. Output is an isExecutable=false skeleton — the deterministic first pass of the transcription pipeline; refinement into a Level 4 executable package is the human/AI-assisted second pass that produced the curated FG3-1/FG3-4/FG3-5 packages. Includes a README and sample-output skeletons emitted from the FG3 register for sub-processes 3.5.2 and 3.5.3.
This commit is contained in:
390
tools/register-transcoder/transcode.py
Normal file
390
tools/register-transcoder/transcode.py
Normal file
@@ -0,0 +1,390 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
register-transcoder — Valsts Kase process register (.xlsx/.xlsm) -> BPMN skeleton.
|
||||
|
||||
Part of the vk-gramatvediba UAPF workspace. Reads a published Valsts Kase
|
||||
"Grāmatvedības uzskaites procesu apraksts" function-group register and emits,
|
||||
for any sub-process in it, a BPMN process skeleton: one task per register
|
||||
step, swimlanes from the RACI columns, and sequence flows reconstructed from
|
||||
the register's own predecessor / successor step references.
|
||||
|
||||
The output is a *skeleton*, not an executable package. It is the deterministic
|
||||
first pass of the transcription pipeline; turning a skeleton into a Level 4
|
||||
executable (explicit gateways, DMN decision extraction, resource mappings,
|
||||
package manifest) is the human/AI-assisted refinement step — see the curated
|
||||
FG3-1, FG3-4 and FG3-5 packages and docs/methodology.md.
|
||||
|
||||
Usage:
|
||||
transcode.py list <register.xlsx>
|
||||
transcode.py emit <register.xlsx> <subprocess> [-o <output.bpmn>]
|
||||
|
||||
Examples:
|
||||
transcode.py list fg3_process.xlsm
|
||||
transcode.py emit fg3_process.xlsm 3.5.2 -o 3.5.2.skeleton.bpmn
|
||||
|
||||
Dependencies: openpyxl.
|
||||
"""
|
||||
import sys
|
||||
import re
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
try:
|
||||
import openpyxl
|
||||
except ImportError:
|
||||
sys.exit("error: openpyxl is required (pip install openpyxl)")
|
||||
|
||||
# XML namespace bound to the "bpmn:" prefix of every emitted document.
BPMN_NS = "http://www.omg.org/spec/BPMN/20100524/MODEL"

# RACI actor columns, in register column order, mapped to BPMN lane ids/names.
# Each entry is (actor key, lane id, lane display name). Position i in this
# list corresponds to the i-th column of the RACI block (raci, raci + 1,
# raci + 2), which is how the parser and the lane picker index it.
ACTORS = [
    ("nodarbinatais", "Lane_Nodarbinatais", "Nodarbinātais"),
    ("iestade", "Lane_Iestade", "Iestāde"),
    ("vpc", "Lane_VPC", "VPC (Vienotais pakalpojumu centrs)"),
]

# Header cell texts used to locate columns (substring match, case-insensitive).
# The header text is lower-cased before comparison, so every value here must
# itself be lower case.
H_PRED = "no procesa darbības soļa"      # predecessor step reference block
H_NR = "nr.p.k"                          # step / section number
H_NAME = "process, apakšprocess"         # step or sub-process name
H_RACI = "atbildības sadalījums"         # first column of the RACI block
H_DESC = "darbību apraksts"              # step description
H_SYSTEM = "izmantotā is"                # information system used
H_DEADLINE = "izpildes termiņš"          # deadline
H_OUTPUTS = "sagatavotie dati"           # prepared data / outputs
H_SUCC = "uz procesa darbības soli"      # successor step reference block
|
||||
|
||||
|
||||
def norm_nr(s):
    """Normalise a step number so that references can be matched as keys.

    Surrounding whitespace and any leading or trailing dots are removed
    ("3.5.2." -> "3.5.2"); dots inside the number are kept. ``None``
    yields "". Non-string values (for example a number taken straight
    from a spreadsheet cell) are converted with ``str()`` first instead of
    raising ``AttributeError``.
    """
    text = "" if s is None else str(s)
    return text.strip().strip(".").strip()
|
||||
|
||||
|
||||
def san(s):
    """Reduce arbitrary text to a fragment that is safe inside a BPMN id.

    Every run of characters other than ASCII letters and digits collapses
    into a single underscore, and underscores at either end are dropped.
    When nothing is left (including ``None`` or "" input) the fragment is
    "x", so callers always get a non-empty string to append to a prefix.
    """
    text = s.strip() if s else ""
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", text)
    fragment = collapsed.strip("_")
    if fragment:
        return fragment
    return "x"
|
||||
|
||||
|
||||
def cell(ws, r, c):
    """Read one worksheet cell as stripped text.

    ``c`` may be ``None`` (a column the caller could not locate); that
    case, like an empty cell, yields "". Any other cell value is passed
    through ``str()`` and stripped of surrounding whitespace.
    """
    if c is not None:
        value = ws.cell(row=r, column=c).value
        if value is not None:
            return str(value).strip()
    return ""
|
||||
|
||||
|
||||
def find_sheet_and_header(wb):
    """Find the register worksheet and the row holding its column headers.

    Worksheets are tried in workbook order. In each one, rows 1-11 and
    columns 1-19 are scanned for a cell whose lower-cased text contains
    H_NR; the first hit decides the result.

    Returns:
        (worksheet, header_row_number)

    Exits the program with an error message when no worksheet has such a
    cell.
    """
    for sheet in wb.worksheets:
        for row in range(1, 12):
            # Lazy generator: scanning stops at the first matching cell.
            values = (
                sheet.cell(row=row, column=col).value for col in range(1, 20)
            )
            if any(v and H_NR in str(v).lower() for v in values):
                return sheet, row
    sys.exit("error: could not find a register sheet (no 'Nr.p.k.' header)")
|
||||
|
||||
|
||||
def map_columns(ws, hrow):
    """Work out which worksheet column holds each logical register field.

    Every non-empty cell of the header row is lower-cased and compared,
    by substring, against the H_* header texts. The first text that
    matches a cell decides which field(s) that column provides.

    Returns:
        dict of field name -> 1-based column index. "nr", "name" and
        "raci" are always present; "desc", "system", "deadline",
        "outputs", "pred_fg"/"pred_nr" and "succ_fg"/"succ_nr" only when
        their header was found.

    Exits the program when one of the three mandatory fields is missing.
    """
    # (header text, ((field, column offset), ...)) in precedence order.
    # The predecessor and successor blocks occupy two columns: the
    # function-group column under the header and the step-number
    # sub-column right after it. The RACI block spans raci, +1 and +2,
    # but only its first column is recorded here.
    rules = (
        (H_PRED, (("pred_fg", 0), ("pred_nr", 1))),
        (H_NR, (("nr", 0),)),
        (H_NAME, (("name", 0),)),
        (H_RACI, (("raci", 0),)),
        (H_DESC, (("desc", 0),)),
        (H_SYSTEM, (("system", 0),)),
        (H_DEADLINE, (("deadline", 0),)),
        (H_OUTPUTS, (("outputs", 0),)),
        (H_SUCC, (("succ_fg", 0), ("succ_nr", 1))),
    )
    found = {}
    for col in range(1, ws.max_column + 1):
        raw = ws.cell(row=hrow, column=col).value or ""
        text = str(raw).lower().strip()
        if not text:
            continue
        for needle, targets in rules:
            if needle in text:
                for field, offset in targets:
                    found[field] = col + offset
                break  # only the first matching header text applies
    for req in ("nr", "name", "raci"):
        if req not in found:
            sys.exit(f"error: register header is missing the '{req}' column")
    return found
|
||||
|
||||
|
||||
def parse_refs(fg_cell, nr_cell):
    """Turn a predecessor/successor cell pair into [(fg, nr_key), ...].

    Both cells are split into non-blank lines. The i-th step number is
    paired with the i-th function-group line; when there are fewer group
    lines than numbers, the first group line (or "" if there is none) is
    used for the remainder. Groups are upper-cased, numbers normalised
    with norm_nr(), and numbers that normalise to "" are dropped.
    """
    groups = [ln.strip() for ln in str(fg_cell).splitlines() if ln.strip()]
    numbers = [ln.strip() for ln in str(nr_cell).splitlines() if ln.strip()]
    fallback = groups[0] if groups else ""

    result = []
    for idx, raw in enumerate(numbers):
        key = norm_nr(raw)
        if not key:
            continue
        group = groups[idx] if idx < len(groups) else fallback
        result.append((group.upper(), key))
    return result
|
||||
|
||||
|
||||
def parse_register(path):
    """Load a register workbook and split its rows into steps and headers.

    Rows without both a number and a name are ignored. A row with a
    description or at least one RACI mark is a step; any other numbered,
    named row is a section / sub-process header, and the steps after it
    belong to that sub-process until the next header row.

    Returns:
        (steps, subprocesses): ``steps`` is a list of step dicts in
        worksheet order; ``subprocesses`` maps a normalised sub-process
        key to its register name. Steps found before the first header row
        have ``"sub"`` set to ``None``.
    """
    workbook = openpyxl.load_workbook(path, data_only=True)
    sheet, header_row = find_sheet_and_header(workbook)
    columns = map_columns(sheet, header_row)
    # Sheet title reduced to letters and digits, upper-cased (e.g. FG3).
    own_fg = re.sub(r"[^A-Za-z0-9]", "", sheet.title).upper()

    def text(row, field):
        # Optional fields may be absent from ``columns``; cell() maps the
        # resulting None column to "".
        return cell(sheet, row, columns.get(field))

    steps = []
    subprocesses = {}
    current_sub = None
    # Data is read from two rows below the header row; presumably the row
    # in between holds sub-column captions (TODO confirm against a register).
    for row in range(header_row + 2, sheet.max_row + 1):
        nr = text(row, "nr")
        name = text(row, "name")
        if not (nr and name):
            continue
        raci = [cell(sheet, row, columns["raci"] + offset) for offset in range(3)]
        desc = text(row, "desc")
        if not desc and not any(raci):
            # section / sub-process header row
            current_sub = norm_nr(nr)
            subprocesses[current_sub] = name
            continue
        step = {
            "nr": nr,
            "key": norm_nr(nr),
            "name": name,
            "sub": current_sub,
            "raci": raci,
            "desc": desc,
            "system": text(row, "system"),
            "deadline": text(row, "deadline"),
            "outputs": text(row, "outputs"),
            "pred": parse_refs(text(row, "pred_fg"), text(row, "pred_nr")),
            "succ": parse_refs(text(row, "succ_fg"), text(row, "succ_nr")),
            "own_fg": own_fg,
        }
        steps.append(step)
    return steps, subprocesses
|
||||
|
||||
|
||||
def primary_lane(raci):
    """Choose the ACTORS entry whose lane a step is drawn in.

    ``raci`` holds the step's RACI marks in ACTORS order. Preference, in
    order: the first actor whose mark contains "R", then the first whose
    mark contains "A", then the first with any mark at all. A step with
    no marks falls back to the last ACTORS entry (VPC).
    """
    rules = (
        lambda mark: "R" in mark.upper(),
        lambda mark: "A" in mark.upper(),
        bool,
    )
    for rule in rules:
        for idx, mark in enumerate(raci):
            if rule(mark):
                return ACTORS[idx]
    return ACTORS[2]  # default: VPC
|
||||
|
||||
|
||||
def build_flows(group):
    """Derive the sequence flows between the steps of one sub-process.

    A predecessor reference on step S yields the edge (ref, S) and a
    successor reference yields (S, ref). Only references whose step key
    belongs to ``group`` count, and self-references are skipped. The
    function-group part of a reference is not consulted.

    Returns:
        set of (src_key, dst_key) tuples.
    """
    known = {step["key"] for step in group}
    links = set()
    for step in group:
        me = step["key"]
        links.update(
            (nr, me) for _fg, nr in step["pred"] if nr in known and nr != me
        )
        links.update(
            (me, nr) for _fg, nr in step["succ"] if nr in known and nr != me
        )
    return links
|
||||
|
||||
|
||||
def doc_text(s):
    """Build the text placed in a step's <documentation> element.

    The lines, joined with newlines, are: the step number with its
    non-empty RACI marks; the description, if any; one " | "-separated
    line of system / deadline / outputs, if any are set; and the
    predecessor and successor references that point outside the step's
    own group. ``s["_groupkeys"]`` must hold the keys of that group
    whenever the step has references.
    """
    def one_line(value):
        return value.replace("\n", " ")

    def external(refs):
        return [f"{fg}/{nr}" for fg, nr in refs if nr not in s["_groupkeys"]]

    # Only the first word of the lane display name is used as the label.
    raci_summary = "; ".join(
        f"{ACTORS[i][2].split(' ')[0]}={s['raci'][i]}"
        for i in range(3)
        if s["raci"][i]
    )
    lines = [f"Nr.p.k.: {s['nr']} | RACI: " + raci_summary]

    if s["desc"]:
        lines.append(s["desc"])

    labelled = (
        ("system", "Sistēma: "),
        ("deadline", "Izpildes termiņš: "),
        ("outputs", "Sagatavotie dati: "),
    )
    meta = [label + one_line(s[field]) for field, label in labelled if s[field]]
    if meta:
        lines.append(" | ".join(meta))

    outside_pred = external(s["pred"])
    outside_succ = external(s["succ"])
    if outside_pred:
        lines.append("Ārējais priekštecis: " + ", ".join(outside_pred))
    if outside_succ:
        lines.append("Ārējais pēctecis: " + ", ".join(outside_succ))
    return "\n".join(lines)
|
||||
|
||||
|
||||
def emit_bpmn(steps, subprocesses, sub):
    """Render one sub-process of the register as a BPMN skeleton document.

    Args:
        steps: step dicts as returned by parse_register().
        subprocesses: mapping of sub-process key -> register name.
        sub: key of the sub-process to emit.

    Returns:
        The BPMN XML as a string ending in a newline. It contains one
        userTask per step, one lane per actor that owns at least one
        step, the sequence flows from build_flows(), and one start event
        per entry step / one end event per exit step.

    Exits the program, listing the known sub-process keys, when no step
    belongs to ``sub``.

    Side effect: every selected step dict gains the keys "_groupkeys"
    (read by doc_text()) and "_lane".
    """
    def attr(text):
        # escape() leaves double quotes untouched. Inside name="..." they
        # must become &quot;, otherwise a quoted register name produces a
        # document that is not well-formed.
        return escape(text, {'"': "&quot;"})

    group = [s for s in steps if s["sub"] == sub]
    if not group:
        avail = ", ".join(sorted(subprocesses)) or "(none)"
        sys.exit(f"error: no steps for sub-process '{sub}'. Available: {avail}")
    gkeys = {s["key"] for s in group}
    for s in group:
        s["_groupkeys"] = gkeys

    edges = build_flows(group)
    indeg = dict.fromkeys(gkeys, 0)
    outdeg = dict.fromkeys(gkeys, 0)
    for a, b in edges:
        outdeg[a] += 1
        indeg[b] += 1
    # A step nothing flows into is an entry, a step nothing flows out of
    # is an exit. If every step has both (e.g. a pure cycle), fall back to
    # the first / last step so the process still gets a start and an end.
    entries = [s for s in group if indeg[s["key"]] == 0] or [group[0]]
    exits = [s for s in group if outdeg[s["key"]] == 0] or [group[-1]]

    tid = {s["key"]: "Task_" + san(s["nr"]) for s in group}
    lanes_used = {}  # lane id -> display name, in first-use order
    for s in group:
        _actor, lane_id, lane_name = primary_lane(s["raci"])
        s["_lane"] = lane_id
        lanes_used.setdefault(lane_id, lane_name)

    starts = [("Start_%d" % n, st) for n, st in enumerate(entries, 1)]
    ends = [("End_%d" % n, st) for n, st in enumerate(exits, 1)]

    name = subprocesses.get(sub, sub)
    proc_id = "Process_" + san(sub)
    out = []
    out.append('<?xml version="1.0" encoding="UTF-8"?>')
    out.append('<bpmn:definitions '
               'xmlns:bpmn="%s" id="Defs_%s" '
               'targetNamespace="https://uapf.dev/vk-gramatvediba/transcoded">'
               % (BPMN_NS, san(sub)))
    out.append(' <bpmn:process id="%s" name="%s" isExecutable="false">'
               % (proc_id, attr(name)))

    # --- lanes ---
    # start/end events go in the lane of the step they touch
    extra = {}
    for node_id, st in starts + ends:
        extra.setdefault(st["_lane"], []).append(node_id)
    out.append(' <bpmn:laneSet id="LaneSet_%s">' % san(sub))
    for lane_id, lane_name in lanes_used.items():
        out.append(' <bpmn:lane id="%s" name="%s">' % (lane_id, attr(lane_name)))
        members = [tid[s["key"]] for s in group if s["_lane"] == lane_id]
        members += extra.get(lane_id, [])
        for node_id in members:
            out.append(' <bpmn:flowNodeRef>%s</bpmn:flowNodeRef>' % node_id)
        out.append(' </bpmn:lane>')
    out.append(' </bpmn:laneSet>')

    # --- collect flows: start->entry, edges, exit->end ---
    flows = []
    incoming = {}
    outgoing = {}

    def add_flow(src, dst):
        fid = "Flow_%d" % (len(flows) + 1)
        flows.append((fid, src, dst))
        outgoing.setdefault(src, []).append(fid)
        incoming.setdefault(dst, []).append(fid)
        return fid

    for sid, st in starts:
        add_flow(sid, tid[st["key"]])
    for a, b in sorted(edges):  # sorted so flow ids are stable across runs
        add_flow(tid[a], tid[b])
    for eid, st in ends:
        add_flow(tid[st["key"]], eid)

    # --- events + tasks ---
    for sid, st in starts:
        out.append(' <bpmn:startEvent id="%s" name="Ieeja: %s">'
                   % (sid, attr(st["nr"])))
        for f in outgoing.get(sid, []):
            out.append(' <bpmn:outgoing>%s</bpmn:outgoing>' % f)
        out.append(' </bpmn:startEvent>')

    for s in group:
        t = tid[s["key"]]
        out.append(' <bpmn:userTask id="%s" name="%s">'
                   % (t, attr(s["name"].replace("\n", " "))))
        # Element content, not an attribute: plain escape() is sufficient.
        out.append(' <bpmn:documentation>%s</bpmn:documentation>'
                   % escape(doc_text(s)))
        for f in incoming.get(t, []):
            out.append(' <bpmn:incoming>%s</bpmn:incoming>' % f)
        for f in outgoing.get(t, []):
            out.append(' <bpmn:outgoing>%s</bpmn:outgoing>' % f)
        out.append(' </bpmn:userTask>')

    for eid, st in ends:
        out.append(' <bpmn:endEvent id="%s" name="Izeja: %s">'
                   % (eid, attr(st["nr"])))
        for f in incoming.get(eid, []):
            out.append(' <bpmn:incoming>%s</bpmn:incoming>' % f)
        out.append(' </bpmn:endEvent>')

    for fid, src, dst in flows:
        out.append(' <bpmn:sequenceFlow id="%s" sourceRef="%s" '
                   'targetRef="%s"/>' % (fid, src, dst))

    out.append(' </bpmn:process>')
    out.append('</bpmn:definitions>')
    return "\n".join(out) + "\n"
|
||||
|
||||
|
||||
def cmd_list(path):
    """Print the sub-processes of a register together with their step counts.

    Args:
        path: path of the register workbook to read.

    Steps that appear before the first sub-process header row have no
    sub-process key (``None``). They are listed last under the label
    "(none)"; sorting and formatting the raw ``None`` key would raise
    ``TypeError``.
    """
    steps, subs = parse_register(path)
    counts = {}
    for s in steps:
        counts[s["sub"]] = counts.get(s["sub"], 0) + 1
    print(f"register: {path}")
    print(f"{len(steps)} steps in {len(counts)} sub-process(es) with steps:\n")
    # The key puts None after every string and never compares None to str.
    for sub in sorted(counts, key=lambda k: (k is None, k or "")):
        label = "(none)" if sub is None else sub
        print(f" {label:<10} {counts[sub]:>3} step(s) {subs.get(sub, '')}")
    print("\nemit a sub-process: transcode.py emit <register> <subprocess>")
|
||||
|
||||
|
||||
def cmd_emit(path, sub, out):
    """Emit the BPMN skeleton of one sub-process to a file or to stdout.

    Args:
        path: path of the register workbook to read.
        sub: key of the sub-process to emit.
        out: output file path; when falsy the XML goes to stdout instead.

    When a file is written, a one-line summary is printed afterwards.
    """
    steps, subs = parse_register(path)
    document = emit_bpmn(steps, subs, sub)
    if not out:
        sys.stdout.write(document)
        return
    with open(out, "w", encoding="utf-8") as handle:
        handle.write(document)
    step_count = sum(1 for step in steps if step["sub"] == sub)
    print(f"wrote {out} ({step_count} step(s), sub-process {sub} — {subs.get(sub,'')})")
|
||||
|
||||
|
||||
def main(argv):
    """Command-line entry point: dispatch to the ``list`` / ``emit`` commands.

    Args:
        argv: full argument vector, program name included (``sys.argv``).

    Exits with the module usage text when the command is missing or
    unknown, and with a short usage line when ``emit`` lacks its
    sub-process argument or ``-o`` is given without a file name.
    """
    usage_emit = "usage: transcode.py emit <register> <subprocess> [-o out]"
    if len(argv) < 3 or argv[1] not in ("list", "emit"):
        # __doc__ is None when docstrings are stripped (python -OO).
        sys.exit((__doc__ or usage_emit).strip())
    if argv[1] == "list":
        cmd_list(argv[2])
        return
    # "-o" in the sub-process position means the sub-process was left out.
    if len(argv) < 4 or argv[3] == "-o":
        sys.exit(usage_emit)
    out = None
    if "-o" in argv[4:]:
        pos = argv.index("-o", 4)
        if pos + 1 >= len(argv):
            # "-o" was the last argument: report usage instead of letting
            # the lookup below fail with IndexError.
            sys.exit(usage_emit)
        out = argv[pos + 1]
    cmd_emit(argv[2], argv[3], out)
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main(sys.argv)
|
||||
Reference in New Issue
Block a user