% paper-2/references.bib. The Routing Premium.
% All entries verified against arXiv pages or the daily research cache as of 2026-05-11.
% Entries carried over from /references.bib (Paper #1) are re-pasted here so paper-2 builds
% as a stand-alone artifact.

% ============================================================================
% Cost-correct frame and wedge prior art (carried over from Paper #1).
% ============================================================================

@inproceedings{erol2026costofpass,
  author    = {Erol, Umutcan and
               El, Jad and
               Suzgun, Mirac and
               Yuksekgonul, Mert and
               Zou, James},
  title     = {The Cost of Being Right:
               Evaluating Language Models by the Cost-of-Pass},
  booktitle = {International Conference on Learning Representations},
  year      = {2026},
  url       = {https://openreview.net/forum?id=vC9S20zsgN},
}

% {Verification} is braced so sentence-casing styles keep the capital that
% follows the full stop; month uses the predefined `may` macro.
@misc{bhardwaj2026costcorrect,
  author       = {Bhardwaj, Manu},
  title        = {The Cost of Being Right. {Verification} Economics in 2026},
  howpublished = {Field Notes \#2, ifitsmanu.com},
  year         = {2026},
  month        = may,
  url          = {https://ifitsmanu.com/papers/the-cost-of-being-right},
}

% {Why} is braced so sentence-casing styles keep the capital that follows
% the full stop; month uses the predefined `may` macro.
@misc{bhardwaj2026alpha,
  author       = {Bhardwaj, Manu},
  title        = {The {$\alpha$} Asymmetry. {Why} Verifiers Can Be Smaller Than Generators},
  howpublished = {Field Notes \#3, ifitsmanu.com},
  year         = {2026},
  month        = may,
  url          = {https://ifitsmanu.com/papers/the-alpha-asymmetry},
}

% {A} is braced so sentence-casing styles keep the capital that opens the
% second sentence of the title; month uses the predefined `may` macro.
@misc{bhardwaj2026inferencefrontier,
  author       = {Bhardwaj, Manu},
  title        = {The Inference-Time Compute Frontier.
                  {A} Cost-Correct Threshold for Training Versus Test-Time Allocation},
  howpublished = {Working paper, ifitsmanu.com},
  year         = {2026},
  month        = may,
  note         = {Paper \#1 of the Inference Economics wedge.},
  url          = {https://ifitsmanu.com/papers/the-inference-time-compute-frontier},
}

@article{snell2024testtime,
  author    = {Snell, Charlie and
               Lee, Jaehoon and
               Xu, Kelvin and
               Kumar, Aviral},
  title     = {Scaling {LLM} Test-Time Compute Optimally
               Can be More Effective than Scaling Model Parameters},
  journal   = {arXiv preprint arXiv:2408.03314},
  year      = {2024},
  url       = {https://arxiv.org/abs/2408.03314},
}

% The acute accent in R{\'e} is written as a BibTeX special character so it
% sorts and labels as a single letter.
@article{brown2024monkeys,
  author    = {Brown, Bradley and
               Juravsky, Jordan and
               Ehrlich, Ryan and
               Clark, Ronald and
               Le, Quoc V. and
               R{\'e}, Christopher and
               Mirhoseini, Azalia},
  title     = {Large Language Monkeys: Scaling Inference Compute with Repeated Sampling},
  journal   = {arXiv preprint arXiv:2407.21787},
  year      = {2024},
  url       = {https://arxiv.org/abs/2407.21787},
}

% The report name is a proper noun; the inner braces stop sentence-casing
% styles from rendering it as "AI index report". The author keeps its double
% braces so the institution is treated as one indivisible name.
@techreport{aiindex2025,
  author      = {{Stanford Human-Centered AI Institute}},
  title       = {{AI Index Report} 2025},
  institution = {Stanford University},
  year        = {2025},
  url         = {https://hai.stanford.edu/ai-index/2025-ai-index-report},
}

@article{lightman2023prm800k,
  author    = {Lightman, Hunter and
               Kosaraju, Vineet and
               Burda, Yura and
               Edwards, Harri and
               Baker, Bowen and
               Lee, Teddy and
               Leike, Jan and
               Schulman, John and
               Sutskever, Ilya and
               Cobbe, Karl},
  title     = {Let's Verify Step by Step},
  journal   = {arXiv preprint arXiv:2305.20050},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.20050},
}

% ============================================================================
% Speculative decoding (per-token routing).
% ============================================================================

@inproceedings{leviathan2023speculative,
  author    = {Leviathan, Yaniv and
               Kalman, Matan and
               Matias, Yossi},
  title     = {Fast Inference from Transformers
               via Speculative Decoding},
  booktitle = {International Conference on Machine Learning},
  year      = {2023},
  url       = {https://arxiv.org/abs/2211.17192},
}

@article{chen2023drafter,
  author    = {Chen, Charlie and
               Borgeaud, Sebastian and
               Irving, Geoffrey and
               Lespiau, Jean-Baptiste and
               Sifre, Laurent and
               Jumper, John},
  title     = {Accelerating Large Language Model Decoding
               with Speculative Sampling},
  journal   = {arXiv preprint arXiv:2302.01318},
  year      = {2023},
  url       = {https://arxiv.org/abs/2302.01318},
}

@article{cai2024medusa,
  author    = {Cai, Tianle and
               Li, Yuhong and
               Geng, Zhengyang and
               Peng, Hongwu and
               Lee, Jason D. and
               Chen, Deming and
               Dao, Tri},
  title     = {Medusa: Simple {LLM} Inference Acceleration Framework
               with Multiple Decoding Heads},
  journal   = {arXiv preprint arXiv:2401.10774},
  year      = {2024},
  url       = {https://arxiv.org/abs/2401.10774},
}

@article{li2024eagle,
  author    = {Li, Yuhui and
               Wei, Fangyun and
               Zhang, Chao and
               Zhang, Hongyang},
  title     = {{EAGLE}: Speculative Sampling Requires
               Rethinking Feature Uncertainty},
  journal   = {arXiv preprint arXiv:2401.15077},
  year      = {2024},
  url       = {https://arxiv.org/abs/2401.15077},
}

% ============================================================================
% Cascade routing.
% ============================================================================

@article{chen2023frugalgpt,
  author    = {Chen, Lingjiao and
               Zaharia, Matei and
               Zou, James},
  title     = {{FrugalGPT}: How to Use Large Language Models
               While Reducing Cost and Improving Performance},
  journal   = {arXiv preprint arXiv:2305.05176},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.05176},
}

% The whole system name, including its math superscript, sits in one brace
% group so sentence-casing styles do not lowercase "-Router".
@article{zhan2026mpd2,
  author    = {Zhan, Wenxin},
  title     = {{MPD$^2$-Router}: Mask-aware Multi-expert Prior-regularized
               Dual-head Deferral Router in Glaucoma Screening and Diagnosis},
  journal   = {arXiv preprint arXiv:2605.08024},
  year      = {2026},
  url       = {https://arxiv.org/abs/2605.08024},
}

% ============================================================================
% Adaptive sampling and complexity-aware exploration.
% ============================================================================

@article{petullo2026veccisc,
  author    = {Petullo, James and
               George, Sonny and
               Cashman, Dylan and
               Xue, Nianwen},
  title     = {{VecCISC}: Improving Confidence-Informed Self-Consistency
               with Reasoning Trace Clustering and Candidate Answer Selection},
  journal   = {arXiv preprint arXiv:2605.08070},
  year      = {2026},
  url       = {https://arxiv.org/abs/2605.08070},
}

@article{petullo2026casql,
  author    = {Petullo, James and
               Xue, Nianwen},
  title     = {{CA-SQL}: Complexity-Aware Inference Time Reasoning
               for Text-to-{SQL} via Exploration and Compute Budget Allocation},
  journal   = {arXiv preprint arXiv:2605.08057},
  year      = {2026},
  url       = {https://arxiv.org/abs/2605.08057},
}

% ============================================================================
% Early-exit and adaptive depth.
% ============================================================================

@article{schuster2022calm,
  author    = {Schuster, Tal and
               Fisch, Adam and
               Gupta, Jai and
               Dehghani, Mostafa and
               Bahri, Dara and
               Tran, Vinh Q. and
               Tay, Yi and
               Metzler, Donald},
  title     = {Confident Adaptive Language Modeling},
  journal   = {arXiv preprint arXiv:2207.07061},
  year      = {2022},
  note      = {Also in NeurIPS 2022.},
  url       = {https://arxiv.org/abs/2207.07061},
}

% ============================================================================
% Serving infrastructure and workload heterogeneity.
% ============================================================================

% The diacritics in {\'I}{\~n}igo are written as BibTeX special characters
% so the name renders correctly under classic BibTeX.
@article{patel2024splitwise,
  author    = {Patel, Pratyush and
               Choukse, Esha and
               Zhang, Chaojie and
               Shah, Aashaka and
               Goiri, {\'I}{\~n}igo and
               Maleki, Saeed and
               Bianchini, Ricardo},
  title     = {Splitwise: Efficient Generative {LLM} Inference Using Phase Splitting},
  journal   = {arXiv preprint arXiv:2311.18677},
  year      = {2024},
  url       = {https://arxiv.org/abs/2311.18677},
}

@article{agrawal2024sarathi,
  author    = {Agrawal, Amey and
               Kedia, Nitin and
               Panwar, Ashish and
               Mohan, Jayashree and
               Kwatra, Nipun and
               Gulavani, Bhargav and
               Tumanov, Alexey and
               Ramjee, Ramachandran},
  title     = {Taming Throughput-Latency Tradeoff
               in {LLM} Inference with {Sarathi-Serve}},
  journal   = {arXiv preprint arXiv:2403.02310},
  year      = {2024},
  url       = {https://arxiv.org/abs/2403.02310},
}

@article{slogard2026,
  author    = {Lysenst{\o}en, Christian},
  title     = {{SLO-Guard}: Crash-Aware, Budget-Consistent Autotuning
               for {SLO}-Constrained {LLM} Serving},
  journal   = {arXiv preprint arXiv:2604.17627},
  year      = {2026},
  url       = {https://arxiv.org/abs/2604.17627},
}

@article{dooly2026inferencesim,
  author    = {Kim, Joon Ha and
               Kim, Geon-Woo and
               Rachakonda, Anoop and
               Kim, Daehyeok},
  title     = {Dooly: Configuration-Agnostic, Redundancy-Aware Profiling
               for {LLM} Inference Simulation},
  journal   = {arXiv preprint arXiv:2605.07985},
  year      = {2026},
  url       = {https://arxiv.org/abs/2605.07985},
}

% {A} is braced so sentence-casing styles keep the capital that follows the
% question mark; {APIs} is protected as a whole word rather than mid-word.
@article{li2026sameservice,
  author    = {Li, Haorui and
               He, Zhenghui and
               Liu, Xuanzi and
               Xu, Yang and
               Liu, Dongsheng and
               Ma, Jiakang and
               Wu, Lupan and
               Wu, Yangjie and
               Tang, Xiongchao and
               Shi, Tianhui},
  title     = {When Is the Same Model Not the Same Service?
               {A} Measurement Study of Hosted Open-Weight {LLM} {APIs}},
  journal   = {arXiv preprint arXiv:2605.02821},
  year      = {2026},
  url       = {https://arxiv.org/abs/2605.02821},
}
