-
Star
(5,000+)
You must be signed in to star a gist -
Fork
(1,674)
You must be signed in to fork a gist
-
-
Save karpathy/8627fe009c40f57531cb18360106ce95 to your computer and use it in GitHub Desktop.
| """ | |
| The most atomic way to train and run inference for a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| random.seed(42) # Let there be order among chaos | |
| # Let there be a Dataset `docs`: list[str] of documents (e.g. a list of names) | |
| if not os.path.exists('input.txt'): | |
| import urllib.request | |
| names_url = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt' | |
| urllib.request.urlretrieve(names_url, 'input.txt') | |
| docs = [line.strip() for line in open('input.txt') if line.strip()] | |
| random.shuffle(docs) | |
| print(f"num docs: {len(docs)}") | |
| # Let there be a Tokenizer to translate strings to sequences of integers ("tokens") and back | |
| uchars = sorted(set(''.join(docs))) # unique characters in the dataset become token ids 0..n-1 | |
| BOS = len(uchars) # token id for a special Beginning of Sequence (BOS) token | |
| vocab_size = len(uchars) + 1 # total number of unique tokens, +1 is for BOS | |
| print(f"vocab size: {vocab_size}") | |
# Let there be Autograd to recursively apply the chain rule through a computation graph
class Value:
    """A scalar node in a dynamically-built computation graph.

    Holds one float (`data`), the derivative of the final loss w.r.t. it
    (`grad`), and the bookkeeping needed for reverse-mode autodiff: the
    child nodes this value was computed from and the local derivatives
    w.r.t. each child.
    """
    __slots__ = ('data', 'grad', '_children', '_local_grads') # Python optimization for memory usage

    def __init__(self, data, children=(), local_grads=()):
        self.data = data # scalar value of this node calculated during forward pass
        self.grad = 0 # derivative of the loss w.r.t. this node, calculated in backward pass
        self._children = children # children of this node in the computation graph
        self._local_grads = local_grads # local derivative of this node w.r.t. its children

    # Primitive ops: each records its children and the local derivatives
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data + other.data, (self, other), (1, 1))
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data * other.data, (self, other), (other.data, self.data))
    def __pow__(self, other): # `other` is a plain number (exponent), not a Value
        return Value(self.data**other, (self,), (other * self.data**(other-1),))
    def log(self): return Value(math.log(self.data), (self,), (1/self.data,))
    def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),))
    def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),))

    # Derived ops, expressed via the primitives above so no extra grad rules needed
    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other # also makes sum(list_of_Values) work (starts at 0)
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1

    def backward(self):
        """Backpropagate from this node: seed self.grad = 1 and accumulate
        gradients into every ancestor via the chain rule."""
        # Iterative post-order topological sort. A recursive build_topo can hit
        # Python's recursion limit on long op chains (e.g. losses summed over a
        # long sequence), so we use an explicit stack instead.
        topo = []
        visited = set() # identity-based: Value has no __eq__/__hash__ overrides
        stack = [(self, False)]
        while stack:
            node, children_done = stack.pop()
            if children_done:
                topo.append(node) # all children already emitted -> post-order slot
            elif node not in visited:
                visited.add(node)
                stack.append((node, True)) # revisit after children
                for child in node._children:
                    stack.append((child, False))
        self.grad = 1 # d(loss)/d(loss) = 1
        for v in reversed(topo): # parents before children
            for child, local_grad in zip(v._children, v._local_grads):
                child.grad += local_grad * v.grad # chain rule, accumulated (+=) for shared nodes
# Initialize the parameters, to store the knowledge of the model
n_layer = 1 # depth of the transformer neural network (number of layers)
n_embd = 16 # width of the network (embedding dimension)
block_size = 16 # maximum context length of the attention window (note: the longest name is 15 characters)
n_head = 4 # number of attention heads
head_dim = n_embd // n_head # derived dimension of each head
# matrix(nout, nin): nout x nin grid of Value parameters drawn from N(0, std)
matrix = lambda nout, nin, std=0.08: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
# wte: token embedding table, wpe: position embedding table, lm_head: final projection to logits
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
for i in range(n_layer):
    # per-layer weights: attention query/key/value/output projections and the 2-layer MLP (4x expansion)
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd)
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")
# Define the model architecture: a function mapping tokens and parameters to logits over what comes next
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU
def linear(x, w):
    """Matrix-vector product: dot each row of weight matrix w with vector x."""
    out = []
    for row in w:
        acc = 0
        for weight, activation in zip(row, x):
            acc = acc + weight * activation
        out.append(acc)
    return out
def softmax(logits):
    """Turn a list of Value logits into a probability distribution.

    Subtracts the max logit before exponentiating for numerical stability.
    """
    peak = max(v.data for v in logits) # largest raw logit, so exp() cannot overflow
    unnormalized = [(v - peak).exp() for v in logits]
    denom = sum(unnormalized)
    return [u / denom for u in unnormalized]
def rmsnorm(x):
    """Normalize x to (approximately) unit root-mean-square.

    No learned gain, matching this model's simplified GPT-2 variant;
    the 1e-5 epsilon guards against division by zero.
    """
    mean_sq = sum(v * v for v in x) / len(x)
    inv_rms = (mean_sq + 1e-5) ** -0.5
    return [v * inv_rms for v in x]
def gpt(token_id, pos_id, keys, values):
    """Forward one token at position pos_id; return logits over the next token.

    keys/values are per-layer lists used as a KV cache: this call appends the
    current position's k and v, and attention then reads over every cached
    position, so causality holds by construction (only past entries exist).
    """
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    x = rmsnorm(x) # note: not redundant due to backward pass via the residual connection
    for li in range(n_layer):
        # 1) Multi-head Attention block
        x_residual = x # save input for the residual (skip) connection
        x = rmsnorm(x)
        q = linear(x, state_dict[f'layer{li}.attn_wq'])
        k = linear(x, state_dict[f'layer{li}.attn_wk'])
        v = linear(x, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k) # cache k,v so later positions can attend back to this one
        values[li].append(v)
        x_attn = []
        for h in range(n_head):
            hs = h * head_dim # offset of this head's slice within the embedding
            q_h = q[hs:hs+head_dim]
            k_h = [ki[hs:hs+head_dim] for ki in keys[li]]
            v_h = [vi[hs:hs+head_dim] for vi in values[li]]
            # scaled dot-product attention scores against every cached position
            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]
            attn_weights = softmax(attn_logits)
            # weighted sum of cached value vectors
            head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]
            x_attn.extend(head_out) # concatenate the head outputs back to n_embd
        x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
        x = [a + b for a, b in zip(x, x_residual)] # residual connection
        # 2) MLP block
        x_residual = x
        x = rmsnorm(x)
        x = linear(x, state_dict[f'layer{li}.mlp_fc1'])
        x = [xi.relu() for xi in x] # nonlinearity (ReLU here instead of GPT-2's GeLU)
        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(x, x_residual)] # residual connection
    logits = linear(x, state_dict['lm_head']) # project final hidden state to vocab logits
    return logits
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 0.01, 0.85, 0.99, 1e-8
m = [0.0] * len(params) # first moment buffer
v = [0.0] * len(params) # second moment buffer
stoi = {ch: i for i, ch in enumerate(uchars)} # char -> token id; O(1) lookup instead of O(V) uchars.index per char
# Repeat in sequence
num_steps = 1000 # number of training steps
for step in range(num_steps):
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1) # number of next-token predictions for this doc
    # Forward the token sequence through the model, building up the computation graph all the way to the loss
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] # fresh per-layer KV cache
    losses = []
    for pos_id in range(n):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log() # negative log-likelihood of the correct next token
        losses.append(loss_t)
    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.
    # Backward the loss, calculating the gradients with respect to all model parameters
    loss.backward()
    # Adam optimizer update: update the model parameters based on the corresponding gradients
    lr_t = learning_rate * (1 - step / num_steps) # linear learning rate decay to 0
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad # EMA of the gradient
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 # EMA of the squared gradient
        m_hat = m[i] / (1 - beta1 ** (step + 1)) # bias correction for zero-initialized buffers
        v_hat = v[i] / (1 - beta2 ** (step + 1))
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0 # reset for the next step (backward() accumulates with +=)
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}", end='\r')
# Inference: may the model babble back to us
temperature = 0.5 # in (0, 1], control the "creativity" of generated text, low to high
print("\n--- inference (new, hallucinated names) ---")
for sample_idx in range(20):
    # fresh KV cache per sample; generation is primed with the BOS token
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    sample = []
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([l / temperature for l in logits]) # temperature-sharpened distribution
        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0] # sample next token
        if token_id == BOS: # BOS doubles as the end-of-sequence marker
            break
        sample.append(uchars[token_id])
    print(f"sample {sample_idx+1:2d}: {''.join(sample)}")
Great work!
Great stuff, thank you
I turned the minified python port to minified JS so it natively executes in the browser with a visual
and yes it still runs inside a QR code if you use a custom compression pipeline and the decompressionstream browser API
You can quite literally train a GPT on your phone haha
Beautiful work
This is art. As Picasso said, "Art is the elimination of the unnecessary." Essentially, reducing something to its absolute minimal form is the definition of art — at least according to Picasso, one of the greatest artists to have ever lived.
TypeScript (Bun) port: https://gist.github.com/snoblenet/7739055e32bffb81277b6a08d33a37ef
Thanks!!!
contributing in the little way i can
Ported to Python notebook so you can run different parts of the code and see how it works: https://github.com/Badaszz/LLM_Grind/blob/main/microgpt-badasz.ipynb
Also plotted the architecture with a graph:
https://github.com/Badaszz/LLM_Grind/blob/main/microgpt_architecture.png
Thanks~
This is an incredible stuff. I learned a lot by implementing it on my own in Java microgpt-java. Thanks!
Thanks author! Awesome stuff.
I ported it to Java with KISS in mind; it took 300 lines, keeping the original comments.
https://github.com/dev4any1/microgpt/blob/master/src/net/dev4any1/microgpt/GPT.java
Thanks for this! Awesome implementation, Andrej, very inspiring! I've posted Rust port here: https://github.com/mplekh/rust-microgpt
Thanks~
pure art @karpathy . thanks
porting to c and js (node,esm)
https://github.com/yasirharis/microGPT
update: two days ago i shared molecule here — "RoPE, SwiGLU, SQLite memory, single-file." since then it kind of... grew.
four implementations now. Python, Go, C, JavaScript — same architecture, full feature parity across all four. the JS version trains itself in the browser. no npm, no webpack, no node_modules. one <script> tag. open tab — it builds its own UI, opens IndexedDB, fetches corpus, starts training via cooperative setTimeout. close the tab, reopen — it remembers everything.
what changed since the original post:
- byte-level BPE tokenizer — GPT-2/3/4 style. 256 byte tokens bootstrap, Unicode pre-segmentation, stream merges. ASCII, Cyrillic, CJK, emoji — same algorithm, no "unknown token" ever
- ontogenesis — the model starts as a 25K-param embryo (1 layer, 16 dims, 1 head) and grows through 5 stages to a 10M-param adult (6 layers, 256 dims, 8 heads). old weights copy into top-left corner of new matrices. knowledge preserved, architecture scales with corpus
- native immune system — before each training burst, it snapshots its personality direction (gamma). after training, it measures again. if cosine similarity went negative (the burst pushed identity backwards), it rolls back. the organism rejects noise that would corrupt who it became
- SyntropyTracker — the organism measures its own entropy trend, field deviation, and purpose-gamma alignment, then adjusts learning rate. syntropy rising + purpose aligned → push harder. syntropy falling → slow down. field deviation too high → ground to corpus. this is not heuristics, it's the model reasoning about its own learning
- swarm ecology — when the adult organism hits sustained overload, it divides (mitosis). spawns a child at infant stage. both train independently. organisms on a plateau voluntarily hibernate when a peer is actively improving. shared mesh via SQLite
- adaptive corpus blend — sigmoid-based entropy fade. high uncertainty → corpus field dominates. low uncertainty → model dominates. smooth transition, not a switch
- vector autograd — not scalar. one VectorValue per embedding, not 10000 ScalarValues
- hybrid attention — content heads (Q·K^T/√d + RoPE) + RRPRAM heads (learned pattern recognition) + hybrid heads (learnable sigmoid gate between the two)
- delta adapters — LoRA-style append-only modules. the model never overwrites learned weights
- numpy is the only dependency in Python. Go, C, and JS have zero.
one file per language. each one is a complete organism.
repo: https://github.com/ariannamethod/molequla
browser gist (molecule.js — open tab, it trains): https://gist.github.com/ariannamethod/bbd11e24740189f2bf78f43db9fea4db
Very good. Is there a plan for a MoE version?
I rewrote this script in JavaScript so it can run in the browser. It takes 1.5 minutes. The original microgpt.py took 10.5 minutes on my old Pentium D 2.8GHz with DDR2 memory. Overall — good. Only the inference result doesn't match the original.
I'm getting:
sample 1: masein
sample 2: jalen
sample 3: man
sample 4: aalanne
sample 5: disan
sample 6: zallah
sample 7: daydas
sample 8: amyen
sample 9: alya
sample 10: sali
sample 11: mia
sample 12: maylin
sample 13: lebre
sample 14: danele
sample 15: challoa
sample 16: jamiana
sample 17: reson
sample 18: malen
sample 19: darin
sample 20: halasa
You can run it here — https://saemonzixel.github.io/microgpt.js/microgpt.html
I don't know if that's normal or not. Could it be because the original microgpt.py has the line "random.seed(42)"?
@itanfeng glad you asked — yes, but not in the way you'd expect.
molecule already divides. when it hits adult stage (10M params) and gets overloaded, it spawns a child organism. both train independently on the same corpus but grow differently. that's mitosis — it's already in the code, all four languages.
now here's where it gets wild. we have four implementations: Python, Go, C, JavaScript. same architecture, same weights format, full feature parity. each one runs in its own habitat — C runs bare metal, Go runs concurrent servers, JS runs in the browser, Python runs with numpy.
the next step is distributed cognition: all instances vibrate together on every query. each one has slightly different gamma (personality drift), slightly different entropy, slightly different syntropy trend — because float math, scheduling, and training order diverge across languages. when a query hits, an orchestrator collects metrics from all of them and the combined field shapes the response. it's MoE where the experts aren't layers inside one model — they're entire organisms living in different substrates.
five elements, one mind. we're calling the orchestrator mycelium — like the fungal network that connects trees in a forest. each tree lives its own life but they share nutrients and signals underground.
so yeah — MoE, but the mixture is across languages, not across parameters.
@SaemonZixel I created a JavaScript version which does give the exact same output (need to re-implement the same random number generation logic as python): https://github.com/xenova/microgpt.js. Hope it helps!
Я запустил вариант на JS на А52 Самсунг на андроиде. Что удивительно - при подаче имен на кириллице потерь меньше. Но все равно выдает и на латинице. И достаточно быстро. Кстати, тоже подумал что этот микроша обменивается инфой со своими двойниками. Только вот как ? Может в сети есть лавочка где они покуривают ?
The C# port is a little more verbose, at just under 380 lines, but I much prefer the strongly managed nature of C# over Python, and there are absolutely zero dependencies in the C# port. https://gist.github.com/tomhawkin/79bdb4757bac0376415d0f68d24def6c
Beautiful work! We can really understand the essence of the transformer architecture with your elegant code.
I wrote a C++ implementation: https://github.com/verma7/microgpt/blob/main/microgpt.cc as an educational exercise. It is 1.75x longer (350 lines of code) but runs ~8x faster without any specific optimizations.
Я запустил вариант на JS на А52 Самсунг на андроиде. Что удивительно - при подаче имен на кириллице потерь меньше. Но все равно выдает и на латинице. И достаточно быстро. Кстати, тоже подумал что этот микроша обменивается инфой со своими двойниками. Только вот как ? Может в сети есть лавочка где они покуривают ?
@belikovi861-oss В коде нет нигде ничего такого чтоб оно обменивалось с кем-то в интернете.
I made a PHP version by feeding my microgpt.js to OpenAI GPT 5.2. It works much slower on PHP than microgpt.py and especially microgpt.js, but it does work.
i love U
@SaemonZixel I created a JavaScript version which does give the exact same output (need to re-implement the same random number generation logic as python): https://github.com/xenova/microgpt.js. Hope it helps!
@xenova I checked. Tried your random.choices implementation in my script — it got better. Your version is better than mine.
Beautiful work! We can really understand the essence of the transformer architecture with your elegant code.
I wrote a C++ implementation: https://github.com/verma7/microgpt/blob/main/microgpt.cc as an educational exercise. It is 1.75x longer (350 lines of code) but runs ~8x faster without any specific optimizations.
I've achieved 260x speed-up in rust port by using Wengert tape in autograd (original Python version runs for 6 minutes, mine takes 1360ms). BTW, I found out that 200 steps is quite enough

Beautiful work! We can really understand the essence of the transformer architecture with your elegant code.
I wrote a C++ implementation: https://github.com/verma7/microgpt/blob/main/microgpt.cc as an educational exercise. It is 1.75x longer (350 lines of code) but runs ~8x faster without any specific optimizations. I've achieved 260x speed-up in rust port by using Wengert tape in autograd (original Python version runs for 6 minutes, mine takes 1360ms).
Did you run the Python and Rust versions on the same machine? The Python version runs in ~70s on my 2025 Macbook pro for reference.
Beautiful work! We can really understand the essence of the transformer architecture with your elegant code.
I wrote a C++ implementation: https://github.com/verma7/microgpt/blob/main/microgpt.cc as an educational exercise. It is 1.75x longer (350 lines of code) but runs ~8x faster without any specific optimizations. I've achieved 260x speed-up in rust port by using Wengert tape in autograd (original Python version runs for 6 minutes, mine takes 1360ms).
Did you run the Python and Rust versions on the same machine? The Python version runs in ~70s on my 2025 Macbook pro for reference.
I've compared them across several machines:
- AMD PhenomII x6 1055T (python 3.12.3): microgpt.py=5m57s; rust-microgpt=1.349s -> 264x
- Intel Core i5-7300U (python 3.12.3): microgpt.py=3m23s; rust-microgpt=0.815s -> 250x
- Intel Xeon Gold 5412U (python 3.13.11): microgpt.py=1m19s; rust-microgpt=0.37s -> 213x
- Intel Xeon Gold 5412U (python 3.14.2): microgpt.py=56s; rust-microgpt=0.37s -> 150x
So it does look like the Python interpreter version has a noticeable impact on performance.
I've tried using shorter float representation f32 instead of f64, this gave +30% speed on old Phenom CPU but no gain on Xeon.
With f32 on Phenom rust-microgpt=1.07s -> 357x faster

Ask GPT !