% BibTeX entry (arXiv preprint)
@misc{FrancoEtAl:arxiv26b,
  author        = {Franco, Gabriel and Tassis, Lucas M. and Rohr, Azalea and Crovella, Mark},
  title         = {Finding Highly Interpretable Prompt-Specific Circuits in Language Models},
  year          = {2026},
  eprint        = {2602.13483},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2602.13483},
  doi           = {10.48550/arXiv.2602.13483},
  abstract      = {Understanding the internal circuits that language models use to solve tasks remains a central challenge in mechanistic interpretability. Most prior work identifies circuits at the \emph{task level} by averaging across many prompts, implicitly assuming a single stable mechanism per task. We show that this assumption can obscure a crucial source of structure: \emph{circuits are prompt-specific}, even within a fixed task. Building on attention causal communication (ACC) \cite{franco2025pinpointing}, we introduce ACC++, refinements that extract cleaner, lower-dimensional \emph{causal} signals inside attention heads from a \emph{single forward pass}. Like ACC, our approach does not require replacement models (e.g., SAEs) or activation patching; ACC++ further improves circuit precision by reducing attribution noise. Applying ACC++ to indirect object identification (IOI) in GPT-2, Pythia, and Gemma 2, we find there is no single circuit for IOI in any model: different prompt templates induce systematically different mechanisms. Despite this variation, prompts cluster into \emph{prompt families} with similar circuits, and we propose a \emph{representative circuit} for each family as a practical unit of analysis. Finally, we develop an automated interpretability pipeline that uses ACC++ signals to surface human-interpretable features and assemble mechanistic explanations for prompt families behavior. Together, our results recast circuits as a meaningful object of study by shifting the unit of analysis from tasks to prompts, enabling scalable circuit descriptions in the presence of prompt-specific mechanisms.},
}