diff --git a/README.md b/README.md index 24fef66..813b88b 100644 --- a/README.md +++ b/README.md @@ -1 +1,36 @@ -# Nano-VLLM \ No newline at end of file +# Nano-vLLM + +A lightweight vLLM implementation built from scratch. + +## Key Features + +* 🚀 **Fast offline inference** - Comparable inference speeds to vLLM +* 📖 **Readable codebase** - Clean implementation under 1,200 lines of Python code +* ⚡ **Optimization Suite** - Prefix caching, Torch compilation, CUDA graph, etc + +## Installation + +```bash +pip install git+https://github.com/GeeeekExplorer/nano-vllm.git +``` + +## Quick Start + +See `example.py` for usage. The API mirrors vLLM's interface with minor differences in the `LLM.generate` method. + +## Benchmark + +See `bench.py` for benchmark. + +**Test Configuration:** +- Hardware: RTX 4070 +- Model: Qwen3-0.6B +- Total Requests: 256 sequences +- Input Length: Randomly sampled between 100–1024 tokens +- Output Length: Randomly sampled between 100–1024 tokens + +**Performance Results:** +| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) | +|----------------|-------------|----------|-----------------------| +| vLLM | 133,966 | 98.95 | 1353.86 | +| Nano-vLLM | 133,966 | 101.90 | 1314.65 | diff --git a/bench.py b/bench.py index 4b4b4a3..99b0cf5 100644 --- a/bench.py +++ b/bench.py @@ -24,4 +24,4 @@ llm.generate(prompt_token_ids, sampling_params) t = (time.time() - t) total_tokens = sum(sp.max_tokens for sp in sampling_params) throughput = total_tokens / t -print(f"Total: {total_tokens}, Time: {t:.2f}s, Throughput: {throughput: .2f}") +print(f"Total: {total_tokens}tok, Time: {t:.2f}s, Throughput: {throughput: .2f}tok/s") diff --git a/example.py b/example.py index 7e46349..fef1f30 100644 --- a/example.py +++ b/example.py @@ -9,8 +9,8 @@ llm = LLM(path, enforce_eager=True) sampling_params = SamplingParams(temperature=0.6, max_tokens=256) prompts = [ - "自我介绍一下吧!", - "列出100内所有素数", + "introduce yourself", + 
"list all prime numbers within 100", ] prompts = [ tokenizer.apply_chat_template( diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..696471e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,26 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "nano-vllm" +version = "0.1.0" +authors = [{ name = "Xingkai Yu" }] +license = "MIT" +license-files = ["LICENSE"] +readme = "README.md" +description = "a mimic VLLM implementation from scratch" +requires-python = ">=3.9,<3.13" +dependencies = [ + "torch>=2.4.0", + "triton>=3.0.0", + "transformers>=4.51.0", + "flash-attn", + "nvidia-ml-py", +] + +[project.urls] +Homepage="https://github.com/GeeeekExplorer/nano-vllm" + +[tool.setuptools] +packages = ["nanovllm"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 402eb01..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -torch -triton -transformers -flash-attn \ No newline at end of file