Making a function opaque

Contain a multi-step helper as a single lineage node with @cf.opaque (or inline with cf.opaque(fn)(df), or with cf.opaque_module for a whole library). The example runs the same feature block twice – tracked normally, then made opaque – so you can see the internal steps collapse into a single boundary node while the output columns are still captured.

Open the full report ↗

Example code

Source: examples/example_opaque_function.py

"""Making a function opaque: contain a multi-step helper as ONE lineage node.

By default conformare tracks every DataFrame operation, so a chatty helper shows up as
several nodes. Marking it **opaque** records it as a single boundary node -- input ->
output, with the output columns still captured and profiled -- while suppressing its
internal steps. Reach for it when a feature block clutters the diagram, or when a library
call's internals would break the lineage.

Three ways to make a function opaque (all shown or noted below):

1. ``@cf.opaque`` decorator on your own function::

       @cf.opaque
       def engineer(df): ...

2. ``cf.opaque(fn)(df)`` inline, to wrap a call you do not own::

       out = cf.opaque(model.transform)(df)

3. ``cf.opaque_module("pkg")`` to make every function from a module opaque (this is how
   ``pyspark.ml`` is contained automatically)::

       cf.opaque_module("mycompany.features")

This example runs the same feature block twice -- tracked normally, then contained -- and
writes a report for each so you can compare.

Run:  python examples/example_opaque_function.py
Then open output/opaque_on_report.html (and opaque_off_report.html for the contrast).
"""

import os

import pandas as pd

import conformare as cf

REGIONS = ["England", "Scotland", "Wales", "Northern Ireland"]


def _customers(n: int = 240) -> pd.DataFrame:
    """Build a deterministic synthetic customer table with ``n`` rows.

    Every value is derived arithmetically from the row index, so repeated calls with the
    same ``n`` produce identical frames. Columns, in order: ``customer_id``, ``region``
    (cycling through ``REGIONS``), ``age``, ``tenure`` and ``spend`` (a float).
    """
    column_names = ["customer_id", "region", "age", "tenure", "spend"]
    region_count = len(REGIONS)

    def row_for(idx):
        # One customer record as a tuple, in the same order as ``column_names``.
        region = REGIONS[idx % region_count]
        age = 18 + (idx * 7) % 55
        tenure = 1 + (idx % 12)
        spend = float(20 + (idx * 17) % 280)
        return (idx, region, age, tenure, spend)

    records = [row_for(idx) for idx in range(n)]
    return pd.DataFrame(records, columns=column_names)


def engineer_features(df):
    """Derive the example's feature columns in four separate DataFrame steps.

    Adds ``spend_per_tenure`` (``spend / tenure``) and ``decade`` (``age`` floored to a
    multiple of 10), keeps only the rows whose ``spend_per_tenure`` is positive, then adds
    the boolean ``high_value`` flag (``spend >= 150``). The steps are kept as individual
    operations on purpose: tracked normally, each one is its own node.
    """
    # Step 1: spend-to-tenure ratio.
    ratio = df["spend"] / df["tenure"]
    with_ratio = df.assign(spend_per_tenure=ratio)

    # Step 2: age bucketed down to its decade.
    with_decade = with_ratio.assign(decade=(with_ratio["age"] // 10) * 10)

    # Step 3: drop rows whose ratio is not strictly positive.
    has_positive_ratio = with_decade["spend_per_tenure"] > 0
    kept = with_decade[has_positive_ratio]

    # Step 4: flag the high spenders.
    return kept.assign(high_value=kept["spend"] >= 150)


@cf.opaque
def engineer_features_contained(df):
    """Opaque twin of ``engineer_features``.

    The body only forwards to ``engineer_features`` so the feature logic lives in one
    place; the ``@cf.opaque`` decorator is what records the call as ONE node and hides
    the internal steps.
    """
    features = engineer_features(df)
    return features


def _run(out, title, *, contained):
    """Run the feature block once under tracking and write its report to ``out``.

    ``contained`` picks the variant: truthy calls the ``@cf.opaque`` wrapper, falsy calls
    ``engineer_features`` directly. Returns a 4-tuple: the value returned by
    ``cf.to_html``, the ``op`` names of lineage entries of kind ``"op"``, the ``op`` names
    of entries of kind ``"function"``, and the sorted output columns recorded for the
    resulting frame.
    """
    # Start each variant from a clean slate (the gallery runs the file once; we run twice).
    cf.restore()
    cf.store.clear()
    cf.reset_context()

    cf.trackPandas()
    cf.set_profiles({"*": [cf.rowCount]})
    cf.describe_process(title)

    def names_of(kind):
        # ``op`` of every lineage entry of the given kind; queries cf.lineage() per call.
        return [entry.op for entry in cf.lineage() if entry.kind == kind]

    customers = _customers()
    # The decorated wrapper is recorded as one node; the plain function as a node per step.
    # When a function cannot be decorated, the inline form is equivalent:
    #     scored = cf.opaque(engineer_features)(customers)
    feature_block = engineer_features_contained if contained else engineer_features
    scored = feature_block(customers)

    report = cf.to_html(out, title=title)
    op_names = names_of("op")
    boundary_names = names_of("function")
    output_columns = sorted(cf.store.cols.get(scored._ft_node_id, []))
    cf.restore()
    return report, op_names, boundary_names, output_columns


def main(out_dir=None):
    """Write both comparison reports, print a summary, and return the opaque report's path.

    ``out_dir`` is the directory to write into; when it is ``None`` (or otherwise falsy)
    the reports go to an ``output`` directory beside the directory that contains this
    file. The directory is created if it does not exist.
    """
    if not out_dir:
        here = os.path.dirname(os.path.abspath(__file__))
        out_dir = os.path.join(os.path.dirname(here), "output")
    os.makedirs(out_dir, exist_ok=True)

    off = os.path.join(out_dir, "opaque_off_report.html")
    on = os.path.join(out_dir, "opaque_on_report.html")

    # Tracked-normally run first, then the contained run.
    _, plain_ops, _, plain_cols = _run(
        off, "Without @cf.opaque (internals tracked)", contained=False
    )
    _, opaque_ops, opaque_fns, opaque_cols = _run(
        on, "With @cf.opaque (one boundary node)", contained=True
    )

    summary = [
        f"wrote {off}",
        f"  tracked normally : {len(plain_ops)} op nodes -> {plain_ops}",
        f"wrote {on}",
        f"  made opaque      : {len(opaque_ops)} op nodes, {len(opaque_fns)} boundary -> {opaque_fns}",
        f"  output columns identical either way: {plain_cols == opaque_cols} ({opaque_cols})",
    ]
    for line in summary:
        print(line)
    return on


# Script entry point: when run directly, generate both reports in the default output directory.
if __name__ == "__main__":
    main()

Making a function opaque

Example code

Output report

Report

Report