Tracking installed (wheel) code

Mark your own pip-installed pipeline package (for example, a wheel deployed to Databricks) as user code so that its functions, and the governance declared in their docstrings, are tracked.

Open the full report ↗

Example code

Source: examples/example_mark_user_packages.py

"""Track your own pipeline code when it is installed as a wheel.

By default conformare treats anything under ``site-packages`` as third-party library
code and does not bracket it with the function hook -- so if your team ships its
pipeline logic as a wheel (common on Databricks), those functions are invisible to
tracking. ``cf.mark_user_packages("yourpkg")`` opts the package back in, so its
functions become governed nodes (and their docstring ``Conformare:`` blocks apply).

This example builds a tiny "installed" company package on the fly (under a
``site-packages`` directory) and runs the same pipeline twice: once **unmarked**
(the package's functions are library code, hidden) and once **marked** (they are
tracked, with governance from their docstrings).

Run:  python examples/example_mark_user_packages.py
Then open output/mark_user_packages_report.html.
"""

import os
import sys
import tempfile
import textwrap

import narwhals as nw
import pandas as pd

import conformare as cf

# A stand-in for your pip-installed pipeline package: acme_pipelines.steps
# This text is written verbatim to ``acme_pipelines/steps.py`` by
# ``_install_fake_package``. It defines two steps: ``standardise_customers``,
# whose docstring carries a ``Conformare:`` block (Purpose / Owner / Risk), and
# ``spend_by_region``, which has a plain one-line docstring with no such block.
_PACKAGE_SOURCE = textwrap.dedent(
    '''
    import narwhals as nw


    def standardise_customers(df):
        """Keep UK adults and add a decade band.

        Conformare:
            Purpose: Adults (18+), with a decade band for cohorting
            Owner: data-platform
            Risk: privacy.pii_exposure | customer rows retained | mitigation: drop name before export | owner: data-governance
        """
        return df.filter(nw.col("age") >= 18).with_columns(decade=(nw.col("age") // 10) * 10)


    def spend_by_region(df):
        """Average spend per region."""
        return df.group_by("region").agg(nw.col("spend").mean())
    '''
)


def _install_fake_package() -> str:
    """Create a throwaway ``acme_pipelines`` package and make it importable.

    The package is written inside a directory named ``site-packages`` that lives
    under a freshly created temporary directory, and that ``site-packages``
    directory is then prepended to ``sys.path``.

    Returns:
        The path of the temporary ``site-packages`` directory.
    """
    site_packages = os.path.join(tempfile.mkdtemp(prefix="conformare_wheel_"), "site-packages")
    package_dir = os.path.join(site_packages, "acme_pipelines")
    os.makedirs(package_dir, exist_ok=True)

    # An empty __init__.py is all that is needed to mark the directory as a package.
    with open(os.path.join(package_dir, "__init__.py"), "w"):
        pass

    with open(os.path.join(package_dir, "steps.py"), "w", encoding="utf-8") as steps_file:
        steps_file.write(_PACKAGE_SOURCE)

    # Front of sys.path so this copy is found first on import.
    sys.path.insert(0, site_packages)
    return site_packages


def _run(mark: bool) -> int:
    """Run the example pipeline once and count what tracking surfaced.

    The store and context are cleared first, so each call starts from a clean
    slate. Tracking is switched on for the duration of the pipeline and is
    always switched off again afterwards, even if the pipeline raises.

    Args:
        mark: When true, call ``cf.mark_user_packages("acme_pipelines")`` before
            the pipeline steps are invoked.

    Returns:
        The number of lineage entries whose ``kind`` is ``"function"`` plus the
        number of entries in ``cf.groups_registry()``.
    """
    cf.store.clear()
    cf.reset_context()
    from acme_pipelines import steps  # installed (site-packages) company code

    cf.trackNarwhals()
    try:
        cf.track_functions(True)
        if mark:
            cf.mark_user_packages("acme_pipelines")

        raw = nw.from_native(
            pd.DataFrame(
                {
                    "region": ["Wales", "England", "Scotland", "England"],
                    "age": [34, 17, 41, 29],
                    "spend": [120.0, 0.0, 80.0, 200.0],
                }
            )
        )
        adults = steps.standardise_customers(raw)
        steps.spend_by_region(adults)

        # What marking adds: function-boundary nodes + governance groups from the
        # package's docstrings. (The inner ops are captured either way -- Narwhals
        # records them.) Counted before teardown, as in the success path originally.
        governed = sum(1 for e in cf.lineage() if e.kind == "function") + len(cf.groups_registry())
    finally:
        # Undo the process-wide tracking switches even when a step fails, so a
        # failed run does not leave tracking enabled for whatever runs next.
        cf.track_functions(False)
        cf.restore()
    return governed


def main(out=None):
    """Build the example report, comparing an unmarked and a marked run.

    Installs the fake ``acme_pipelines`` package, runs the pipeline once without
    and once with ``cf.mark_user_packages``, renders the HTML report via
    ``cf.to_html`` and prints a short summary.

    Args:
        out: Destination path for the HTML report. When omitted (or falsy), it
            defaults to ``output/mark_user_packages_report.html`` under the
            parent of this file's directory. A bare filename with no directory
            part is accepted.

    Returns:
        The report path that was used (``out`` or the default).
    """
    out = out or os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "output",
        "mark_user_packages_report.html",
    )
    # os.makedirs("") raises FileNotFoundError, so only create the parent when
    # the path actually has a directory component (e.g. not for "report.html").
    parent = os.path.dirname(out)
    if parent:
        os.makedirs(parent, exist_ok=True)

    _install_fake_package()

    unmarked = _run(mark=False)  # company functions are library code -> hidden
    marked = _run(mark=True)  # opted in -> tracked as governed nodes

    # The report reflects the marked run (the company code is tracked).
    html = cf.to_html(out, title="Tracking installed (wheel) pipeline code")
    print(f"wrote {out} ({len(html):,} bytes)")
    print(f"governed nodes (boundaries + groups): unmarked={unmarked}  marked={marked}")
    print(
        "Marking the installed package surfaces its functions as governed nodes "
        "(with docstring risks); unmarked, they are treated as library internals."
    )
    return out


# Script entry point: running this file directly builds the report at the
# default output path (main() is called without an explicit ``out``).
if __name__ == "__main__":
    main()

Output report

Open in a new tab ↗


This site uses Just the Docs, a documentation theme for Jekyll.