Governance in docstrings

Risks, owners and definitions declared purely in docstrings, no decorators.

Open the full report ↗

Example code

Source: examples/example_docstring_tagging.py

"""Docstring tagging: declare governance in docstrings, with **no decorator**.

A function whose docstring carries a ``Conformare:`` block is, by that fact, asking to
be tracked. With function-call capture on (``cf.track_functions()``), conformare detects
that block when the function is called, opens a documented context for it automatically
-- using the **function name** as the label -- and applies the parsed purpose, owner and
risks. So the steps below are *plain functions*: no ``@describe``, no arguments threaded
through the calls. The governance lives entirely in the docstrings.

(The ``@cf.describe("Label")`` decorator remains available if you'd rather not enable the
global function hook; it reads the same docstring block.)

Run:  python examples/example_docstring_tagging.py
Then open output/docstring_report.html (the ``output/`` directory sits next to ``examples/``).
"""

import os
import tempfile

import narwhals as nw
import pandas as pd

import conformare as cf

# Regions that count as UK-resident.
UK_REGIONS = [
    "London",
    "Manchester",
    "Leeds",
    "Bristol",
    "Glasgow",
]
# Full region pool: the UK regions followed by two non-UK ones.
ALL_REGIONS = [*UK_REGIONS, "Dublin", "Paris"]


# --- plain pipeline steps: governance declared only in the docstrings --------
def clean_customers(df):
    """Keep adults resident in the UK.

    Drops under-18s and any non-UK region. `region` is the **billing** region, so a
    customer travelling abroad is still counted under their home region.

    Conformare:
        Purpose: Adults (18+) resident in the UK
        Owner: data-governance
        Risk: privacy.pii_exposure | name + email retained through cleaning | mitigation: Drop email before export | owner: data-governance
        Risk: compliance.gdpr | severity: high | UK-resident PII processed end to end
    """
    # Two separate filter steps, applied in this order: age first, then region.
    adults = df.filter(nw.col("age") >= 18)
    uk_adults = adults.filter(nw.col("region").is_in(UK_REGIONS))
    return uk_adults


def watch_time_by_region(customers, streams):
    """Average watch minutes per UK region.

    Inner-joins cleaned customers to streams and averages `watch_minutes` per region.

    Conformare:
        Purpose: Average watch minutes per UK region
        Owner: analytics-product
        Risk: quality.outliers | very short sessions skew the mean | mitigation: report median alongside mean
    """
    # Only customers that have at least one stream survive the inner join.
    merged = customers.join(streams, how="inner", on="customer_id")
    per_region = merged.group_by("region")
    mean_minutes = nw.col("watch_minutes").mean().alias("avg_watch_minutes")
    return per_region.agg(mean_minutes)


def top_customers(customers, streams):
    """Per-customer watch-time leaderboard.

    Sums `watch_minutes` per customer, keyed by name for the leaderboard view.

    Conformare:
        Purpose: Per-customer total watch time for leaderboards
        Owner: analytics-product
        Risk: privacy.pii_exposure | output keyed by customer name | mitigation: Pseudonymise before sharing
    """
    merged = customers.join(streams, how="inner", on="customer_id")
    # Group on the id and carry the name along so each output row has both.
    per_customer = merged.group_by("customer_id", "full_name")
    total_minutes = nw.col("watch_minutes").sum().alias("total_watch_minutes")
    return per_customer.agg(total_minutes)


def _data(n_customers=240, n_streams=720):
    """Build the synthetic customers and streams tables used by the demo.

    Every value is derived arithmetically from the row index, so the same
    arguments always produce the same data.

    Args:
        n_customers: Number of rows in the customers table.
        n_streams: Number of rows in the streams table; their ``customer_id``
            values cycle through ``range(n_customers)``.

    Returns:
        A ``(customers, streams)`` tuple of pandas DataFrames.
    """
    customer_idx = range(n_customers)
    stream_idx = range(n_streams)
    n_regions = len(ALL_REGIONS)

    customer_columns = {
        "customer_id": customer_idx,
        "full_name": [f"Customer {k}" for k in customer_idx],
        "email": [f"user{k}@example.com" for k in customer_idx],
        # Regions are assigned round-robin over ALL_REGIONS.
        "region": [ALL_REGIONS[k % n_regions] for k in customer_idx],
        # Ages fall in the range 16-65.
        "age": [16 + (k * 7) % 50 for k in customer_idx],
    }
    stream_columns = {
        "customer_id": [k % n_customers for k in stream_idx],
        "movie_id": [k % 25 for k in stream_idx],
        "watch_minutes": [float((k * 13) % 200) for k in stream_idx],
    }
    return pd.DataFrame(customer_columns), pd.DataFrame(stream_columns)


def main(out=None):
    """Run the docstring-tagged demo pipeline and write the HTML report.

    Turns on conformare's narwhals and function-call tracking, runs the three
    pipeline steps on synthetic data, writes the report and prints a short
    summary of the resulting model.

    Args:
        out: Path of the HTML report to write. When falsy, defaults to
            ``output/docstring_report.html`` under the parent of this file's
            directory.

    Returns:
        The path the report was written to.
    """
    out = out or os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "output",
        "docstring_report.html",
    )
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    cf.trackNarwhals()
    try:
        cf.track_functions()  # capture function calls -> auto-document Conformare-tagged ones
        cf.set_profiles(
            {
                "*": [
                    cf.rowCount,
                    cf.dataSize,
                    cf.nullFraction(columns="all"),
                    cf.histogram(columns=["age", "watch_minutes", "avg_watch_minutes"]),
                    cf.iqrOutliers(columns=["age", "watch_minutes", "avg_watch_minutes"]),
                ]
            }
        )
        cf.mark_sensitive("region", tag="location", category="Location", severity="low")
        cf.describe_process(
            "Streaming analytics whose governance (purpose, owners, risks) is declared in "
            "each step's docstring via the Conformare: tagging standard — the pipeline code "
            "has no decorators or governance arguments at all."
        )

        cust_pd, streams_pd = _data()
        customers = nw.from_native(cust_pd)
        streams = nw.from_native(streams_pd)

        uk_adults = clean_customers(customers)
        by_region = watch_time_by_region(uk_adults, streams)
        leaders = top_customers(uk_adults, streams)

        # The CSV is a throwaway export; the directory is removed once the
        # report and model have been built instead of being left behind.
        with tempfile.TemporaryDirectory(prefix="ft_docstring_") as tmp:
            leaders.write_csv(os.path.join(tmp, "top_customers.csv"))

            html = cf.to_html(
                path=out, title="Docstring-tagged pipeline (no decorators) — conformare report"
            )
            m = cf.build_model(cf.store)

        print(f"wrote {out} ({len(html):,} bytes)")
        print(
            f"  nodes={m['stats']['nodes']}, contexts={len(m['groups'])}, "
            f"risks={m['stats']['risks']} (governance: {m['stats']['governance']})"
        )
        for group in m["groups"].values():
            print(f"    context '{group['label']}' — owner={group['definition_owner']}")
        # by_region is never read again; referencing it marks it as intentionally unused.
        _ = (by_region, leaders)
    finally:
        # Always run restore(), even when a step above raises.
        # NOTE(review): presumably this removes the tracking hooks — confirm.
        cf.restore()
    return out


# Script entry point: run the demo with the default report path.
if __name__ == "__main__":
    main()

Output report

Open in a new tab ↗


This site uses Just the Docs, a documentation theme for Jekyll.