Governance in docstrings
Risks, owners and definitions declared purely in docstrings, no decorators.
Example code
Source: examples/example_docstring_tagging.py
"""Docstring tagging: declare governance in docstrings, with **no decorator**.
A function whose docstring carries a ``Conformare:`` block is, by that fact, asking to
be tracked. With function-call capture on (``cf.track_functions()``), conformare detects
that block when the function is called, opens a documented context for it automatically
-- using the **function name** as the label -- and applies the parsed purpose, owner and
risks. So the steps below are *plain functions*: no ``@describe``, no arguments threaded
through the calls. The governance lives entirely in the docstrings.
(The ``@cf.describe("Label")`` decorator remains available if you'd rather not enable the
global function hook; it reads the same docstring block.)
Run: python examples/example_docstring_tagging.py
Then open output/docstring_report.html (written under the repository root by default).
"""
import os
import tempfile
import narwhals as nw
import pandas as pd
import conformare as cf
# Regions treated as "UK" by the cleaning step.
UK_REGIONS = ["London", "Manchester", "Leeds", "Bristol", "Glasgow"]
# Every region the synthetic customers are drawn from: the UK ones plus two non-UK ones.
ALL_REGIONS = [*UK_REGIONS, "Dublin", "Paris"]
# --- plain pipeline steps: governance declared only in the docstrings --------
def clean_customers(df):
    """Keep adults resident in the UK.

    Drops under-18s and any non-UK region. `region` is the **billing** region, so a
    customer travelling abroad is still counted under their home region.

    Conformare:
        Purpose: Adults (18+) resident in the UK
        Owner: data-governance
        Risk: privacy.pii_exposure | name + email retained through cleaning | mitigation: Drop email before export | owner: data-governance
        Risk: compliance.gdpr | severity: high | UK-resident PII processed end to end
    """
    # The docstring text above is governance input as well as documentation, so it
    # is kept verbatim.
    is_adult = nw.col("age") >= 18
    in_uk = nw.col("region").is_in(UK_REGIONS)
    # Two separate filter calls, age first and region second, so the same
    # sequence of frame operations is performed as before.
    adults = df.filter(is_adult)
    return adults.filter(in_uk)
def watch_time_by_region(customers, streams):
    """Average watch minutes per UK region.

    Inner-joins cleaned customers to streams and averages `watch_minutes` per region.

    Conformare:
        Purpose: Average watch minutes per UK region
        Owner: analytics-product
        Risk: quality.outliers | very short sessions skew the mean | mitigation: report median alongside mean
    """
    # Inner join: only rows whose customer_id appears in both frames are kept.
    per_region = customers.join(streams, on="customer_id", how="inner").group_by("region")
    # One output row per region, with the mean exposed as `avg_watch_minutes`.
    mean_minutes = nw.col("watch_minutes").mean().alias("avg_watch_minutes")
    return per_region.agg(mean_minutes)
def top_customers(customers, streams):
    """Per-customer watch-time leaderboard.

    Sums `watch_minutes` per customer, keyed by name for the leaderboard view.

    Conformare:
        Purpose: Per-customer total watch time for leaderboards
        Owner: analytics-product
        Risk: privacy.pii_exposure | output keyed by customer name | mitigation: Pseudonymise before sharing
    """
    # Inner join: only rows whose customer_id appears in both frames are kept.
    per_customer = customers.join(streams, on="customer_id", how="inner").group_by(
        "customer_id", "full_name"
    )
    # One output row per (customer_id, full_name) pair with the summed minutes.
    total_minutes = nw.col("watch_minutes").sum().alias("total_watch_minutes")
    return per_customer.agg(total_minutes)
def _data(n_customers=240, n_streams=720):
    """Build the deterministic synthetic input frames for the example.

    Args:
        n_customers: Number of customer rows to generate.
        n_streams: Number of stream rows to generate.

    Returns:
        A ``(customers, streams)`` tuple of pandas DataFrames. ``customers`` has the
        columns ``customer_id``, ``full_name``, ``email``, ``region`` and ``age``;
        ``streams`` has ``customer_id``, ``movie_id`` and ``watch_minutes``.
    """
    customer_ids = range(n_customers)
    stream_rows = range(n_streams)
    n_regions = len(ALL_REGIONS)

    # Customer columns: regions cycle through ALL_REGIONS and ages cycle through 16..65.
    names = [f"Customer {i}" for i in customer_ids]
    emails = [f"user{i}@example.com" for i in customer_ids]
    regions = [ALL_REGIONS[i % n_regions] for i in customer_ids]
    ages = [16 + (i * 7) % 50 for i in customer_ids]
    customers = pd.DataFrame(
        {
            "customer_id": customer_ids,
            "full_name": names,
            "email": emails,
            "region": regions,
            "age": ages,
        }
    )

    # Stream columns: rows are assigned to customers round-robin, across 25 movie ids.
    stream_owner = [i % n_customers for i in stream_rows]
    movie_ids = [i % 25 for i in stream_rows]
    minutes = [float((i * 13) % 200) for i in stream_rows]
    streams = pd.DataFrame(
        {
            "customer_id": stream_owner,
            "movie_id": movie_ids,
            "watch_minutes": minutes,
        }
    )
    return customers, streams
def main(out=None):
    """Run the docstring-tagged example pipeline and write the HTML report.

    Enables narwhals and function-call tracking, runs the three plain pipeline
    steps on the synthetic data, writes the leaderboard CSV to a scratch
    directory, renders the report and prints a short summary of the built model.

    Args:
        out: Path of the HTML report. When falsy, defaults to
            ``output/docstring_report.html`` under the parent of this file's
            directory.

    Returns:
        The report path that was used.
    """
    out = out or os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "output",
        "docstring_report.html",
    )
    # A bare filename has an empty directory part, and os.makedirs("") raises,
    # so only create the directory when there is one to create.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    cf.trackNarwhals()
    cf.track_functions()  # capture function calls -> auto-document Conformare-tagged ones
    # Everything after the hooks are installed runs under try/finally so that
    # cf.restore() is still called when a step raises.
    try:
        cf.set_profiles(
            {
                "*": [
                    cf.rowCount,
                    cf.dataSize,
                    cf.nullFraction(columns="all"),
                    cf.histogram(columns=["age", "watch_minutes", "avg_watch_minutes"]),
                    cf.iqrOutliers(columns=["age", "watch_minutes", "avg_watch_minutes"]),
                ]
            }
        )
        cf.mark_sensitive("region", tag="location", category="Location", severity="low")
        cf.describe_process(
            "Streaming analytics whose governance (purpose, owners, risks) is declared in "
            "each step's docstring via the Conformare: tagging standard — the pipeline code "
            "has no decorators or governance arguments at all."
        )

        cust_pd, streams_pd = _data()
        customers = nw.from_native(cust_pd)
        streams = nw.from_native(streams_pd)

        # Plain calls: no decorators and no governance arguments are passed.
        uk_adults = clean_customers(customers)
        by_region = watch_time_by_region(uk_adults, streams)
        leaders = top_customers(uk_adults, streams)

        # The CSV goes to a scratch directory that is removed once the report and
        # model are built, instead of leaving a new temp directory behind per run.
        with tempfile.TemporaryDirectory(prefix="ft_docstring_") as tmp:
            leaders.write_csv(os.path.join(tmp, "top_customers.csv"))
            html = cf.to_html(
                path=out, title="Docstring-tagged pipeline (no decorators) — conformare report"
            )
            m = cf.build_model(cf.store)

        print(f"wrote {out} ({len(html):,} bytes)")
        print(
            f"  nodes={m['stats']['nodes']}, contexts={len(m['groups'])}, "
            f"risks={m['stats']['risks']} (governance: {m['stats']['governance']})"
        )
        # Only the group records are needed; their keys are not printed.
        for group in m["groups"].values():
            print(f"  context '{group['label']}' — owner={group['definition_owner']}")
    finally:
        cf.restore()
    # NOTE(review): presumably kept so the otherwise-unused results are not
    # flagged by linters — confirm before removing.
    _ = (by_region, leaders)
    return out
if __name__ == "__main__":
main()