Making a function opaque
Contain a multi-step helper as one lineage node with @cf.opaque (or cf.opaque(fn)(df) inline, or cf.opaque_module for a whole library). The example runs the same feature block twice – once tracked normally and once made opaque – to show the internals collapsing to a single boundary node while the output columns are still captured.
Example code
Source: examples/example_opaque_function.py
"""Making a function opaque: contain a multi-step helper as ONE lineage node.
By default conformare tracks every DataFrame operation, so a chatty helper shows up as
several nodes. Marking it **opaque** records it as a single boundary node -- input ->
output, with the output columns still captured and profiled -- while suppressing its
internal steps. Reach for it when a feature block clutters the diagram, or when a library
call's internals would break the lineage.
Three ways to make a function opaque (all shown or noted below):
1. ``@cf.opaque`` decorator on your own function::
@cf.opaque
def engineer(df): ...
2. ``cf.opaque(fn)(df)`` inline, to wrap a call you do not own::
out = cf.opaque(model.transform)(df)
3. ``cf.opaque_module("pkg")`` to make every function from a module opaque (this is how
``pyspark.ml`` is contained automatically)::
cf.opaque_module("mycompany.features")
This example runs the same feature block twice -- tracked normally, then contained -- and
writes a report for each so you can compare.
Run: python examples/example_opaque_function.py
Then open output/opaque_on_report.html (and opaque_off_report.html for the contrast).
"""
import os
import pandas as pd
import conformare as cf
# Region labels that ``_customers`` cycles through (by row index) to fill the "region" column.
REGIONS = ["England", "Scotland", "Wales", "Northern Ireland"]
def _customers(n: int = 240) -> pd.DataFrame:
    """Build a deterministic synthetic customer table with ``n`` rows.

    Every value is derived arithmetically from the row index, so repeated
    calls with the same ``n`` produce the same frame.

    Args:
        n: Number of customer rows to generate.

    Returns:
        A DataFrame with columns ``customer_id``, ``region``, ``age``,
        ``tenure`` and ``spend``.
    """
    column_names = ["customer_id", "region", "age", "tenure", "spend"]
    region_count = len(REGIONS)

    records = []
    for idx in range(n):
        region = REGIONS[idx % region_count]  # cycle through the region labels
        age = 18 + (idx * 7) % 55  # 18..72
        tenure = 1 + (idx % 12)  # 1..12, never zero
        spend = float(20 + (idx * 17) % 280)  # 20.0..299.0
        records.append((idx, region, age, tenure, spend))

    return pd.DataFrame(records, columns=column_names)
def engineer_features(df):
    """Derive the feature columns in four separate DataFrame steps.

    Tracked normally, each step below shows up as its own lineage node.

    Steps, in order:
      1. add ``spend_per_tenure`` (``spend`` divided by ``tenure``);
      2. add ``decade`` (``age`` floored to a multiple of ten);
      3. keep only rows whose ``spend_per_tenure`` is greater than zero;
      4. add the boolean ``high_value`` (``spend`` of at least 150).

    Args:
        df: Frame providing ``spend``, ``tenure`` and ``age`` columns.

    Returns:
        A new frame with the three derived columns appended; the input frame
        is not modified.
    """
    ratio = df["spend"] / df["tenure"]
    with_ratio = df.assign(spend_per_tenure=ratio)

    decade = (with_ratio["age"] // 10) * 10
    with_decade = with_ratio.assign(decade=decade)

    has_positive_ratio = with_decade["spend_per_tenure"] > 0
    kept = with_decade[has_positive_ratio]

    is_high_value = kept["spend"] >= 150
    return kept.assign(high_value=is_high_value)
@cf.opaque
def engineer_features_contained(df):
    """Opaque twin of ``engineer_features``.

    Same logic as ``engineer_features`` -- but ``@cf.opaque`` records it as ONE node and
    hides the internal steps. The body only delegates, so the feature logic keeps a
    single source of truth.
    """
    return engineer_features(df)
def _run(out, title, *, contained):
    """Run the feature block once under fresh tracking and write its HTML report.

    Args:
        out: Destination path handed to ``cf.to_html``.
        title: Used both as the process description and as the report title.
        contained: When true, run the ``@cf.opaque`` variant; otherwise run the
            normally tracked ``engineer_features``.

    Returns:
        A tuple ``(html, ops, fns, out_cols)``: whatever ``cf.to_html`` returned,
        the ``op`` of every lineage entry whose ``kind`` is ``"op"``, the ``op``
        of every entry whose ``kind`` is ``"function"``, and the sorted column
        names stored for the result frame's node.
    """
    # Fresh state for each variant (the gallery runs the file once; we run twice).
    cf.restore()
    cf.store.clear()
    cf.reset_context()
    cf.trackPandas()
    try:
        cf.set_profiles({"*": [cf.rowCount]})
        cf.describe_process(title)
        df = _customers()
        if contained:
            scored = engineer_features_contained(df)  # the @cf.opaque version (one node)
            # The inline form is equivalent when you cannot decorate the function:
            # scored = cf.opaque(engineer_features)(df)
        else:
            scored = engineer_features(df)  # tracked normally (every step a node)
        html = cf.to_html(out, title=title)
        # Read the lineage once and classify from that single snapshot.
        entries = list(cf.lineage())
        ops = [e.op for e in entries if e.kind == "op"]
        fns = [e.op for e in entries if e.kind == "function"]
        out_cols = sorted(cf.store.cols.get(scored._ft_node_id, []))
    finally:
        # Always run the closing restore, even when a step above raised, so a
        # failure here does not leave tracking switched on for whatever runs next
        # in the same process.
        # NOTE(review): assumes cf.restore() is the counterpart of
        # cf.trackPandas() -- confirm against the conformare docs.
        cf.restore()
    return html, ops, fns, out_cols
def main(out_dir=None):
    """Generate both reports (tracked vs. opaque) and print a short comparison.

    Args:
        out_dir: Directory to write the reports into. When falsy, defaults to an
            ``output`` directory next to the directory that contains this file.

    Returns:
        The path of the opaque ("on") report.
    """
    if not out_dir:
        here = os.path.dirname(os.path.abspath(__file__))
        out_dir = os.path.join(os.path.dirname(here), "output")
    os.makedirs(out_dir, exist_ok=True)

    tracked_path = os.path.join(out_dir, "opaque_off_report.html")
    opaque_path = os.path.join(out_dir, "opaque_on_report.html")

    # Both variants run before anything is printed, so the summary stays together.
    _, tracked_ops, _, tracked_cols = _run(
        tracked_path, "Without @cf.opaque (internals tracked)", contained=False
    )
    _, opaque_ops, opaque_fns, opaque_cols = _run(
        opaque_path, "With @cf.opaque (one boundary node)", contained=True
    )

    print(f"wrote {tracked_path}")
    print(f" tracked normally : {len(tracked_ops)} op nodes -> {tracked_ops}")
    print(f"wrote {opaque_path}")
    print(f" made opaque : {len(opaque_ops)} op nodes, {len(opaque_fns)} boundary -> {opaque_fns}")
    print(f" output columns identical either way: {tracked_cols == opaque_cols} ({opaque_cols})")
    return opaque_path
# Script entry point: write both reports to the default output directory when run directly.
if __name__ == "__main__":
    main()