% references.bib. Field Note #5 seed bibliography.
% Curated for: "Disaggregated or Colocated? The Cost-Frontier of LLM Serving
% Under SLO Contracts." Field Notes #5, ifitsmanu.com, 2026.
%
% Selection criteria:
%   - Primary venues per agent profile: SOSP, OSDI, NSDI, MLSys, ATC.
%   - Cross-referenced architecture/economics papers from ISCA and FAST
%     where they are the originating source for a load-bearing claim.
%   - Every entry verified against arXiv or the official venue page; no
%     fabricated IDs.
%
% Citation hygiene per docs/research/00-playbook.md §1: cite primary, carry
% conditions in body text, snapshot URLs via web.archive.org before publish.

% --- Colocated continuous batching and KV-cache management ---

% PagedAttention (SOSP 2023). Carries both a DOI and an arXiv eprint number.
@inproceedings{kwon2023pagedattention,
  author        = {Kwon, Woosuk and Li, Zhuohan and Zhuang, Siyuan and
                   Sheng, Ying and Zheng, Lianmin and Yu, Cody Hao and
                   Gonzalez, Joseph E. and Zhang, Hao and Stoica, Ion},
  title         = {Efficient Memory Management for Large Language Model
                   Serving with {PagedAttention}},
  booktitle     = {Proceedings of the 29th Symposium on Operating
                   Systems Principles ({SOSP})},
  year          = {2023},
  doi           = {10.1145/3600006.3613165},
  eprint        = {2309.06180},
  archivePrefix = {arXiv},
}

% Orca (OSDI 2022). No DOI or eprint is recorded; the entry is located via
% the USENIX presentation URL.
@inproceedings{yu2022orca,
  author    = {Yu, Gyeong-In and Jeong, Joo Seong and Kim, Geon-Woo and
               Kim, Soojeong and Chun, Byung-Gon},
  title     = {{Orca}: A Distributed Serving System for
               Transformer-Based Generative Models},
  booktitle = {16th {USENIX} Symposium on Operating Systems Design
               and Implementation ({OSDI})},
  year      = {2022},
  url       = {https://www.usenix.org/conference/osdi22/presentation/yu},
}

% --- Model parallelism for serving ---

% AlpaServe (OSDI 2023), identified by its arXiv eprint number.
@inproceedings{li2023alpaserve,
  author        = {Li, Zhuohan and Zheng, Lianmin and Zhong, Yinmin and
                   Liu, Vincent and Sheng, Ying and Jin, Xin and
                   Huang, Yanping and Chen, Zhifeng and Zhang, Hao and
                   Gonzalez, Joseph E. and Stoica, Ion},
  title         = {{AlpaServe}: Statistical Multiplexing with Model
                   Parallelism for Deep Learning Serving},
  booktitle     = {17th {USENIX} Symposium on Operating Systems Design
                   and Implementation ({OSDI})},
  year          = {2023},
  eprint        = {2302.11665},
  archivePrefix = {arXiv},
}

% --- Chunked-prefill colocation ---

% Sarathi-Serve (OSDI 2024), identified by its arXiv eprint number.
@inproceedings{agrawal2024sarathi,
  author        = {Agrawal, Amey and Kedia, Nitin and Panwar, Ashish and
                   Mohan, Jayashree and Kwatra, Nipun and
                   Gulavani, Bhargav S. and Tumanov, Alexey and
                   Ramjee, Ramachandran},
  title         = {Taming Throughput-Latency Tradeoff in {LLM} Inference
                   with {Sarathi-Serve}},
  booktitle     = {18th {USENIX} Symposium on Operating Systems Design
                   and Implementation ({OSDI})},
  year          = {2024},
  eprint        = {2403.02310},
  archivePrefix = {arXiv},
}

% --- Prefill/decode disaggregation and cross-instance scheduling ---

% DistServe (OSDI 2024), identified by its arXiv eprint number.
@inproceedings{zhong2024distserve,
  author        = {Zhong, Yinmin and Liu, Shengyu and Chen, Junda and
                   Hu, Jianbo and Zhu, Yibo and Liu, Xuanzhe and
                   Jin, Xin and Zhang, Hao},
  title         = {{DistServe}: Disaggregating Prefill and Decoding for
                   Goodput-optimized Large Language Model Serving},
  booktitle     = {18th {USENIX} Symposium on Operating Systems Design
                   and Implementation ({OSDI})},
  year          = {2024},
  eprint        = {2401.09670},
  archivePrefix = {arXiv},
}

% Llumnix (OSDI 2024), identified by its arXiv eprint number.
@inproceedings{sun2024llumnix,
  author        = {Sun, Biao and Huang, Ziming and Zhao, Hanyu and
                   Xiao, Wencong and Zhang, Xinyi and Li, Yong and
                   Lin, Wei},
  title         = {{Llumnix}: Dynamic Scheduling for Large Language
                   Model Serving},
  booktitle     = {18th {USENIX} Symposium on Operating Systems Design
                   and Implementation ({OSDI})},
  year          = {2024},
  eprint        = {2406.03243},
  archivePrefix = {arXiv},
}

% Splitwise (ISCA 2024); cross-venue entry, see the note field for why it is
% included. The accents in the fifth author's given name are written as
% brace-wrapped BibTeX special characters so that styles which abbreviate or
% sort names treat each accented letter as a single letter.
@inproceedings{patel2024splitwise,
  author    = {Pratyush Patel and Esha Choukse and Chaojie Zhang and Aashaka
               Shah and {\'I}{\~n}igo Goiri and Saeed Maleki and Ricardo
               Bianchini},
  title     = {{Splitwise}: Efficient Generative {LLM} Inference Using Phase
               Splitting},
  booktitle = {Proceedings of the 51st Annual International Symposium on
               Computer Architecture ({ISCA})},
  year      = {2024},
  eprint    = {2311.18677},
  archivePrefix = {arXiv},
  note      = {Cross-venue reference. ISCA is not in the primary venue list
               for this note but is the originating publication for the
               phase-splitting cost model.}
}

% Mooncake (FAST 2025); cross-venue entry, see the note field.
% NOTE(review): title and author list below follow the FAST '25 proceedings
% version, which differs from the arXiv preprint 2407.00079 (preprint title:
% "Mooncake: A KVCache-centric Disaggregated Architecture for LLM Serving",
% seven authors). The entry previously paired the preprint metadata with the
% FAST booktitle. TODO confirm title and author order against the USENIX
% FAST '25 page before publish.
@inproceedings{qin2025mooncake,
  author    = {Ruoyu Qin and Zheming Li and Weiran He and Jialei Cui and
               Feng Ren and Mingxing Zhang and Yongwei Wu and Weimin Zheng
               and Xinran Xu},
  title     = {{Mooncake}: Trading More Storage for Less Computation --- A
               {KVCache}-centric Architecture for Serving {LLM} Chatbot},
  booktitle = {23rd {USENIX} Conference on File and Storage Technologies
               ({FAST})},
  year      = {2025},
  eprint    = {2407.00079},
  archivePrefix = {arXiv},
  note      = {Cross-venue reference. FAST is not in the primary venue list
               but Mooncake is the production-scale anchor for KV-cache
               disaggregation.}
}

% --- Programming model and prefix caching ---

% SGLang (NeurIPS 2024), identified by its arXiv eprint number.
@inproceedings{zheng2024sglang,
  author        = {Zheng, Lianmin and Yin, Liangsheng and Xie, Zhiqiang and
                   Sun, Chuyue and Huang, Jeff and Yu, Cody Hao and
                   Cao, Shiyi and Kozyrakis, Christos and Stoica, Ion and
                   Gonzalez, Joseph E. and Barrett, Clark and Sheng, Ying},
  title         = {{SGLang}: Efficient Execution of Structured Language
                   Model Programs},
  booktitle     = {Advances in Neural Information Processing Systems
                   ({NeurIPS})},
  year          = {2024},
  eprint        = {2312.07104},
  archivePrefix = {arXiv},
}

% --- Heterogeneous-memory and offloading inference ---

% FlexGen (ICML 2023), identified by its arXiv eprint number. The accented
% surname is written as R{\'e}: the accent sits in its own brace group so
% BibTeX treats it as one special character when sorting and building labels.
@inproceedings{sheng2023flexgen,
  author    = {Ying Sheng and Lianmin Zheng and Binhang Yuan and Zhuohan Li
               and Max Ryabinin and Daniel Y. Fu and Zhiqiang Xie and Beidi
               Chen and Clark Barrett and Joseph E. Gonzalez and Percy Liang
               and Christopher R{\'e} and Ion Stoica and Ce Zhang},
  title     = {{FlexGen}: High-Throughput Generative Inference of Large
               Language Models with a Single {GPU}},
  booktitle = {Proceedings of the 40th International Conference on Machine
               Learning ({ICML})},
  year      = {2023},
  eprint    = {2303.06865},
  archivePrefix = {arXiv}
}

% --- SLO-aware ML serving (the older NSDI line that the LLM work inherits) ---

% SHEPHERD (NSDI 2023). Located via the USENIX presentation URL only.
@inproceedings{zhang2023shepherd,
  author    = {Zhang, Hong and Tang, Yupeng and Khandelwal, Anurag and
               Stoica, Ion},
  title     = {{SHEPHERD}: Serving {DNNs} in the Wild},
  booktitle = {20th {USENIX} Symposium on Networked Systems Design
               and Implementation ({NSDI})},
  year      = {2023},
  url       = {https://www.usenix.org/conference/nsdi23/presentation/zhang-hong},
}

% Clipper (NSDI 2017). Located via the USENIX presentation URL only.
@inproceedings{crankshaw2017clipper,
  author    = {Crankshaw, Daniel and Wang, Xin and Zhou, Giulio and
               Franklin, Michael J. and Gonzalez, Joseph E. and
               Stoica, Ion},
  title     = {{Clipper}: A Low-Latency Online Prediction Serving
               System},
  booktitle = {14th {USENIX} Symposium on Networked Systems Design
               and Implementation ({NSDI})},
  year      = {2017},
  url       = {https://www.usenix.org/conference/nsdi17/technical-sessions/presentation/crankshaw},
}

% INFaaS (USENIX ATC 2021). Located via the USENIX presentation URL only.
@inproceedings{romero2021infaas,
  author    = {Romero, Francisco and Li, Qian and Yadwadkar, Neeraja J.
               and Kozyrakis, Christos},
  title     = {{INFaaS}: Automated Model-less Inference Serving},
  booktitle = {2021 {USENIX} Annual Technical Conference
               ({USENIX ATC})},
  year      = {2021},
  url       = {https://www.usenix.org/conference/atc21/presentation/romero},
}

% --- Decoding-time parallelism (held fixed in Section 3.4) ---

% Speculative decoding (ICML 2023), identified by its arXiv eprint number.
@inproceedings{leviathan2023speculative,
  author        = {Leviathan, Yaniv and Kalman, Matan and Matias, Yossi},
  title         = {Fast Inference from Transformers via Speculative
                   Decoding},
  booktitle     = {Proceedings of the 40th International Conference on
                   Machine Learning ({ICML})},
  year          = {2023},
  eprint        = {2211.17192},
  archivePrefix = {arXiv},
}

% --- Adjacent context, added during drafting pass (2026-05-11) ---
% Single-author arXiv preprints. Cited as related-work pointers, not as
% load-bearing primary results. Both verified against arXiv abstract page.

% SLO-Guard arXiv preprint (cs.LG). The note field records its role as a
% related-work pointer.
@misc{lysenstoen2026sloguard,
  author        = {Lysenst{\o}en, Christian},
  title         = {{SLO-Guard}: Crash-Aware, Budget-Consistent
                   Autotuning for SLO-Constrained {LLM} Serving},
  year          = {2026},
  eprint        = {2604.17627},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2604.17627},
  note          = {arXiv preprint, April 2026. Single-author. Cited
                   as related-work pointer on SLO contracts as an
                   autotuning objective; not a load-bearing primary
                   result.},
}

% Multi-tier KV-cache memory management arXiv preprint (cs.AR). The note
% field records its role as a related-work pointer.
@misc{ganjihal2026multitierkv,
  author        = {Ganjihal, Sanjeev Rao},
  title         = {Predictive Multi-Tier Memory Management for {KV}
                   Cache in Large-Scale {GPU} Inference},
  year          = {2026},
  eprint        = {2604.26968},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AR},
  url           = {https://arxiv.org/abs/2604.26968},
  note          = {arXiv preprint, April 2026. Single-author. Cited
                   as related-work pointer on the KV cache as a
                   memory-hierarchy problem, adjacent to but not the
                   focus of this paper's architectural frontier.},
}

% --- Reference model (cited at first mention in Section 3.4) ---

% Llama 2 arXiv report (cs.CL). The author list is truncated with the
% BibTeX "and others" token; the note field holds the model constants used
% by the paper.
@misc{touvron2023llama2,
  author        = {Touvron, Hugo and Martin, Louis and Stone, Kevin and
                   Albert, Peter and Almahairi, Amjad and
                   Babaei, Yasmine and Bashlykov, Nikolay and
                   Batra, Soumya and Bhargava, Prajjwal and
                   Bhosale, Shruti and others},
  title         = {Llama 2: Open Foundation and Fine-Tuned Chat Models},
  year          = {2023},
  eprint        = {2307.09288},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2307.09288},
  note          = {Reference model for the frontier (Llama-2-70B,
                   80 layers, 8 KV heads under GQA, 128 head dim, BF16).
                   KV-byte accounting in Section 4.3 derives from these
                   constants.},
}

% --- Hybrid state-space / attention architectures (forward pointer, §8.4) ---

% Mamba arXiv preprint (cs.LG). The math in the note field uses a braced
% subscript with \mathrm so it typesets without amsmath's \text being
% loaded and does not rely on an unbraced macro as the subscript argument.
@misc{gu2023mamba,
  author       = {Albert Gu and Tri Dao},
  title        = {{Mamba}: Linear-Time Sequence Modeling with Selective
                  State Spaces},
  year         = {2023},
  eprint       = {2312.00752},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  url          = {https://arxiv.org/abs/2312.00752},
  note         = {Cited in Section 8.4 as a pure state-space architecture
                  on which $\tau_{\mathrm{KV}}$ collapses, redrawing the
                  break-even surface.}
}

% Jamba arXiv report (cs.CL); full 22-author list.
@misc{lieber2024jamba,
  author        = {Lieber, Opher and Lenz, Barak and Bata, Hofit and
                   Cohen, Gal and Osin, Jhonathan and Dalmedigos, Itay and
                   Safahi, Erez and Meirom, Shaked and
                   Belinkov, Yonatan and Shalev-Shwartz, Shai and
                   Abend, Omri and Alon, Raz and Asida, Tomer and
                   Bergman, Amir and Glozman, Roman and
                   Gokhman, Michael and Manevich, Avshalom and
                   Ratner, Nir and Rozen, Noam and Shwartz, Erez and
                   Zusman, Mor and Shoham, Yoav},
  title         = {{Jamba}: A Hybrid Transformer-{Mamba} Language Model},
  year          = {2024},
  eprint        = {2403.19887},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2403.19887},
  note          = {Cited in Section 8.4 as a Transformer-Mamba hybrid
                   where the partition redraws around the hybrid's
                   effective KV size rather than a pure-attention KV.},
}

% --- Field Note #1 anchor (the prior art this paper extends) ---

% Field Note #1, self-published on ifitsmanu.com. Version and date are
% carried in the note field.
@misc{bhardwaj2026inferencestack,
  author        = {Bhardwaj, Manu},
  title         = {The Inference Stack in 2026: A Field Note on Token
                   Economics, Runtime Systems, and Model Architecture},
  year          = {2026},
  howpublished  = {Field Notes, ifitsmanu.com},
  url           = {https://ifitsmanu.com/research/inference-stack-2026},
  note          = {Version 3.0, 3 May 2026. Introduces Verified
                   Capability per Dollar (VCpD) as the operational unit
                   of inference economics. This paper carries that frame
                   one stack-layer down to the serving architecture.},
}

% --- Adjacent profiling-based simulator, contemporaneous arXiv preprint ---

% Dooly arXiv preprint (cs.DC). The citation key is kept as-is because
% documents cite it by this key.
@misc{dooly2026,
  author        = {Kim, Joon Ha and Kim, Geon-Woo and Rachakonda, Anoop
                   and Kim, Daehyeok},
  title         = {{Dooly}: Configuration-Agnostic, Redundancy-Aware
                   Profiling for {LLM} Inference Simulation},
  year          = {2026},
  eprint        = {2605.07985},
  archivePrefix = {arXiv},
  primaryClass  = {cs.DC},
  url           = {https://arxiv.org/abs/2605.07985},
  note          = {arXiv preprint, May 2026. Verified against
                   arxiv.org 2026-05-11 daily cache. Cited in Section 6.4
                   as the state-of-the-art profile-database simulator and
                   the natural complement to the closed-form
                   decomposition for extending the frontier to private
                   hardware.},
}

% --- FLOPs scaling reference for prefill-share derivation ---

% Chinchilla (NeurIPS 2022); full 22-author list. The note field limits the
% scope of what this entry is cited for.
@inproceedings{hoffmann2022chinchilla,
  author        = {Hoffmann, Jordan and Borgeaud, Sebastian and
                   Mensch, Arthur and Buchatskaya, Elena and
                   Cai, Trevor and Rutherford, Eliza and
                   de Las Casas, Diego and Hendricks, Lisa Anne and
                   Welbl, Johannes and Clark, Aidan and Hennigan, Tom and
                   Noland, Eric and Millican, Katie and
                   van den Driessche, George and Damoc, Bogdan and
                   Guy, Aurelia and Osindero, Simon and
                   Simonyan, Karen and Elsen, Erich and Rae, Jack W. and
                   Vinyals, Oriol and Sifre, Laurent},
  title         = {Training Compute-Optimal Large Language Models},
  booktitle     = {Advances in Neural Information Processing Systems
                   ({NeurIPS})},
  year          = {2022},
  eprint        = {2203.15556},
  archivePrefix = {arXiv},
  note          = {Cited only for the standard $2nP$ FLOPs accounting
                   that underwrites the prefill-share derivation in
                   Section 4.2.},
}

% --- Hardware references for KV-bandwidth and PCIe vs NVLink discussion ---

% NVIDIA H100 architecture whitepaper. The corporate author is double-braced
% so BibTeX keeps it as one indivisible name.
% NOTE(review): the 3.35 TB/s bandwidth figure may originate from the later
% H100 datasheet rather than this whitepaper -- confirm the source.
@misc{nvidia2022hopper,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA H100} {Tensor Core GPU} Architecture Whitepaper},
  year   = {2022},
  url    = {https://resources.nvidia.com/en-us-tensor-core/gtc22-whitepaper-hopper},
  note   = {Specifications used: 80 GB HBM3, 3.35 TB/s on-package
            bandwidth, NVLink 4 at 900 GB/s, PCIe Gen5 at 128 GB/s.},
}

% NVIDIA A100 architecture whitepaper. The corporate author is double-braced
% so BibTeX keeps it as one indivisible name.
% NOTE(review): the 80 GB / HBM2e / 2.04 TB/s figures in the note look like
% they belong to the later A100 80GB datasheet; the 2020 whitepaper may only
% describe the 40 GB part -- confirm which document backs these numbers.
@misc{nvidia2020ampere,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA A100} {Tensor Core GPU} Architecture Whitepaper},
  year   = {2020},
  url    = {https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/nvidia-ampere-architecture-whitepaper.pdf},
  note   = {Specifications used in Section 5.3: A100 SXM4 80 GB, HBM2e
            at 2.04 TB/s on-package, BF16 dense throughput 312 TFLOPS,
            NVLink 3 at 600 GB/s.},
}

% NVIDIA H200 datasheet (product page URL). The corporate author is
% double-braced so BibTeX keeps it as one indivisible name.
@misc{nvidia2023h200,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA H200} {Tensor Core GPU} Datasheet},
  year   = {2023},
  url    = {https://www.nvidia.com/en-us/data-center/h200/},
  note   = {Specifications used in Section 5.3: H200 SXM5 141 GB HBM3e
            at 4.80 TB/s on-package, BF16 dense throughput 989 TFLOPS
            (matches H100), NVLink 4 at 900 GB/s.},
}

% --- Public hourly pricing snapshots (rate sheet archived with companion repo) ---

% CoreWeave pricing page snapshot. urldate records the observation date
% already stated in the note, so biblatex-style tooling can print an access
% date for this URL-only source; classic BibTeX styles ignore the field.
@misc{coreweave2026pricing,
  author       = {{CoreWeave, Inc.}},
  title        = {{CoreWeave} {GPU} Cloud Pricing},
  year         = {2026},
  url          = {https://www.coreweave.com/pricing},
  urldate      = {2026-05-01},
  note         = {Hourly rates pinned to 2026-05-01 observation; rate sheet
                  archived in the companion repository.}
}

% AWS EC2 P5 pricing snapshot. urldate records the pin date already stated
% in the note, so biblatex-style tooling can print an access date for this
% URL-only source; classic BibTeX styles ignore the field.
@misc{aws2026p5,
  author       = {{Amazon Web Services}},
  title        = {{Amazon EC2 P5} Instance Pricing},
  year         = {2026},
  url          = {https://aws.amazon.com/ec2/instance-types/p5/},
  urldate      = {2026-05-01},
  note         = {On-demand rate for p5.48xlarge (8x H100 SXM) pinned to
                  2026-05-01.}
}