Tracking installed (wheel) code
Mark your own pip-installed pipeline package (e.g. a wheel deployed to Databricks) as user code so that its functions and their docstring governance are tracked.
Example code
Source: examples/example_mark_user_packages.py
"""Track your own pipeline code when it is installed as a wheel.
By default conformare treats anything under ``site-packages`` as third-party library
code and does not bracket it with the function hook -- so if your team ships its
pipeline logic as a wheel (common on Databricks), those functions are invisible to
tracking. ``cf.mark_user_packages("yourpkg")`` opts the package back in, so its
functions become governed nodes (and their docstring ``Conformare:`` blocks apply).
This example builds a tiny "installed" company package on the fly (under a
``site-packages`` directory) and runs the same pipeline twice: once **unmarked**
(the package's functions are library code, hidden) and once **marked** (they are
tracked, with governance from their docstrings).
Run: python examples/example_mark_user_packages.py
Then open output/mark_user_packages_report.html.
"""
import atexit
import os
import shutil
import sys
import tempfile
import textwrap

import narwhals as nw
import pandas as pd

import conformare as cf
# A stand-in for your pip-installed pipeline package: acme_pipelines.steps
# The text is written verbatim to ``steps.py``, so it must be valid Python once
# ``textwrap.dedent`` has removed the common leading indentation.
_PACKAGE_SOURCE = textwrap.dedent(
    '''
    import narwhals as nw


    def standardise_customers(df):
        """Keep UK adults and add a decade band.

        Conformare:
            Purpose: Adults (18+), with a decade band for cohorting
            Owner: data-platform
            Risk: privacy.pii_exposure | customer rows retained | mitigation: drop name before export | owner: data-governance
        """
        return df.filter(nw.col("age") >= 18).with_columns(decade=(nw.col("age") // 10) * 10)


    def spend_by_region(df):
        """Average spend per region."""
        return df.group_by("region").agg(nw.col("spend").mean())
    '''
)
def _install_fake_package() -> str:
    """Write acme_pipelines into a temp site-packages dir and put it on sys.path.

    The package is created beneath a directory literally named
    ``site-packages`` inside a fresh temporary directory, and that directory is
    inserted at the front of ``sys.path`` so ``import acme_pipelines`` resolves
    to it.

    The temporary tree is deleted when the interpreter exits, so repeated runs
    do not accumulate ``conformare_wheel_*`` directories.

    Returns:
        The path of the ``site-packages`` directory added to ``sys.path``.
    """
    tmp_root = tempfile.mkdtemp(prefix="conformare_wheel_")
    # Best-effort cleanup at exit: a failure to delete the scratch tree must
    # not turn an otherwise successful run into an error.
    atexit.register(shutil.rmtree, tmp_root, ignore_errors=True)

    root = os.path.join(tmp_root, "site-packages")
    pkg = os.path.join(root, "acme_pipelines")
    os.makedirs(pkg, exist_ok=True)

    # An empty __init__.py makes acme_pipelines a regular package.
    with open(os.path.join(pkg, "__init__.py"), "w", encoding="utf-8"):
        pass
    with open(os.path.join(pkg, "steps.py"), "w", encoding="utf-8") as fh:
        fh.write(_PACKAGE_SOURCE)

    # Front of the path so this copy wins over any other acme_pipelines.
    sys.path.insert(0, root)
    return root
def _run(mark: bool) -> int:
    """Run the demo pipeline once and count what conformare governed.

    Args:
        mark: When true, call ``cf.mark_user_packages("acme_pipelines")`` before
            running the pipeline so the installed package is opted in.

    Returns:
        The number of lineage entries whose ``kind`` is ``"function"`` plus the
        number of entries in ``cf.groups_registry()``.
    """
    # Reset conformare's store and context before each run.
    cf.store.clear()
    cf.reset_context()

    # Imported here rather than at module top: the package only becomes
    # importable after _install_fake_package() has put it on sys.path.
    from acme_pipelines import steps  # installed (site-packages) company code

    cf.trackNarwhals()
    try:
        cf.track_functions(True)
        if mark:
            cf.mark_user_packages("acme_pipelines")

        raw = nw.from_native(
            pd.DataFrame(
                {
                    "region": ["Wales", "England", "Scotland", "England"],
                    "age": [34, 17, 41, 29],
                    "spend": [120.0, 0.0, 80.0, 200.0],
                }
            )
        )
        adults = steps.standardise_customers(raw)
        steps.spend_by_region(adults)

        # What marking adds: function-boundary nodes + governance groups from the
        # package's docstrings. (The inner ops are captured either way -- Narwhals
        # records them.)
        boundaries = sum(1 for e in cf.lineage() if e.kind == "function")
        return boundaries + len(cf.groups_registry())
    finally:
        # Always switch tracking back off, even when the pipeline raised, so a
        # failed run does not leave function tracking enabled for the caller.
        cf.track_functions(False)
        cf.restore()
def main(out=None):
    """Run the pipeline unmarked and marked, then write the HTML report.

    Args:
        out: Destination path for the HTML report. When omitted (or falsy), the
            report goes to ``output/mark_user_packages_report.html`` under the
            parent of the directory containing this file.

    Returns:
        The path the report was written to.
    """
    out = out or os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "output",
        "mark_user_packages_report.html",
    )
    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    _install_fake_package()
    unmarked = _run(mark=False)  # company functions are library code -> hidden
    marked = _run(mark=True)  # opted in -> tracked as governed nodes

    # The report reflects the marked run (the company code is tracked).
    html = cf.to_html(out, title="Tracking installed (wheel) pipeline code")
    print(f"wrote {out} ({len(html):,} bytes)")
    print(f"governed nodes (boundaries + groups): unmarked={unmarked} marked={marked}")
    print(
        "Marking the installed package surfaces its functions as governed nodes "
        "(with docstring risks); unmarked, they are treated as library internals."
    )
    return out
# Script entry point: run the example only when executed directly, not on import.
if __name__ == "__main__":
    main()