Bootstrapped (unmodified script)
Instrument a production PySpark script from the outside, source untouched.
Example code
Source: examples/bootstrap/bootstrap.py
"""Bootstrap entry point: document & track ``pipeline.py`` WITHOUT modifying it.
With ``conformare.bootstrap`` the whole bootstrap collapses to: declare which
functions to document (with their properties) and what to run. The helper handles
``trackSpark``, profilers, decorating each function in place (across modules, any
import style), running the original script, and writing the report.
Nothing in ``pipeline.py`` or ``utils.py`` is edited.
Run: python bootstrap.py (from the examples/bootstrap folder)
Then open output/bootstrap_report.html (written under the repo-root output/ folder).
"""
import os
import conformare as cf
from conformare.bootstrap import bootstrap, doc
import pipeline # the unmodified production script
import utils # a second unmodified module it depends on
# Numeric columns used for the histogram and IQR-outlier profiles.
NUMERIC = ["age", "watch_minutes", "avg_watch_minutes", "total_watch_minutes"]

# The report is written to the repo-root output/ folder. Starting from this
# file, climb one directory per step: examples/bootstrap -> examples -> repo root.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_EXAMPLES_DIR = os.path.dirname(_SCRIPT_DIR)
_REPO_ROOT = os.path.dirname(_EXAMPLES_DIR)
_OUTDIR = os.path.join(_REPO_ROOT, "output")

# Full path of the HTML report produced by the bootstrap run.
REPORT = os.path.join(_OUTDIR, "bootstrap_report.html")
if __name__ == "__main__":
    # The report is written inside _OUTDIR, so create the folder up front.
    os.makedirs(_OUTDIR, exist_ok=True)

    # Profilers registered under the "*" key of ``profiles``.
    # NOTE(review): "*" presumably means "every documented function" — confirm
    # against the conformare documentation.
    profilers = [
        cf.rowCount,
        cf.dataSize,
        cf.nullFraction(columns="all"),
        cf.histogram(columns=NUMERIC),
        cf.iqrOutliers(columns=NUMERIC),
    ]

    # Free-text description of the overall process shown for this run.
    process_summary = (
        "Streaming analytics pipeline (instrumented at bootstrap, source "
        "scripts untouched): clean UK adult customers, then report watch time "
        "by region and top customers."
    )

    # Columns flagged as sensitive, with their governance tags.
    sensitive_columns = {
        "region": {"tag": "location", "category": "Location", "severity": "low"},
    }

    # One declaration per function to document, with its governance
    # properties. doc() receives the function object itself, so helpers from
    # utils are decorated wherever they are referenced, whatever the import
    # style.
    documented = [
        doc(
            utils.standardise,
            "Standardise customers",
            purpose="Normalise region casing and lowercase email",
            definition_owner="data-engineering",
        ),
        doc(
            utils.flag_minors,
            "Flag minors",
            purpose="Mark under-18 customers",
            definition_owner="data-governance",
        ),
        # clean_customers already declares its governance in a Conformare:
        # docstring block, so only a label is supplied here.
        doc(pipeline.clean_customers, "Clean customers"),
        doc(
            pipeline.watch_time_by_region,
            "Watch time by region",
            purpose="Average watch minutes per UK region",
            definition_owner="analytics-product",
        ),
        doc(
            pipeline.top_customers,
            "Top customers",
            purpose="Total watch minutes per customer for leaderboards",
            definition_owner="analytics-product",
            risks=cf.risk(
                "privacy.pii_exposure",
                note="output keyed by customer name",
                mitigation="Pseudonymise before sharing",
            ),
        ),
    ]

    bootstrap(
        run=pipeline.main,  # the original script's entry point, left untouched
        backend="spark",
        profiles={"*": profilers},
        process=process_summary,
        sensitive=sensitive_columns,
        docs=documented,
        report=REPORT,
        title="Bootstrapped streaming pipeline — conformare report",
    )