Created
May 23, 2026 06:21
-
-
Save peczenyj/bf0459370621cd7d215d6ecb722c703d to your computer and use it in GitHub Desktop.
Train neural network to find xor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Train 3 neurons to compute XOR, from scratch, in pure NumPy. | |
| Architecture (the one from our diagram): | |
| 2 inputs -> hidden layer of 2 neurons (A, B) -> 1 output neuron (C) | |
| 9 parameters total: each neuron has 2 weights + 1 bias. | |
| Everything is visible: the forward pass, the loss, the gradients, and the | |
| weight update. No PyTorch, nothing hidden. Run with: python xor_train.py | |
| """ | |
| import numpy as np | |
# Seed the global NumPy RNG so every run starts from the same random weights;
# change or remove this line to see different random starts.
np.random.seed(1)

# ---------------------------------------------------------------------------
# 1) The data: the XOR truth table.
#    X holds the 4 examples, one per row, each with 2 input bits.
#    y holds the desired output bit for each row, as a (4, 1) column.
# ---------------------------------------------------------------------------
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)

# Targets in the same row order as X: 0^0=0, 0^1=1, 1^0=1, 1^1=0
y = np.array([0, 1, 1, 0], dtype=float).reshape(-1, 1)
| # --------------------------------------------------------------------------- | |
| # 2) The activation function (the "squashing" we discussed) and its slope. | |
| # sigmoid maps any number into (0,1). Its derivative is needed for learning. | |
| # --------------------------------------------------------------------------- | |
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), element-wise.

    Accepts a scalar or an array and maps every value into the range [0, 1].

    The textbook formula calls exp(-z), which overflows (with a
    RuntimeWarning) once z is a large negative number. This version only
    ever exponentiates a non-positive value, so it cannot overflow:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))     (the same quantity, rearranged)
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # np.where turns a scalar input into a 0-d array; unwrap it so scalar
    # callers still get a plain NumPy float back.
    return out[()] if out.ndim == 0 else out
def sigmoid_deriv(a):
    """Return the slope of the sigmoid, given its OUTPUT a = sigmoid(z).

    Note the argument is the activation, not the pre-activation z: the
    slope is computed as a * (1 - a).
    """
    complement = 1.0 - a
    return a * complement
| # --------------------------------------------------------------------------- | |
| # 3) The parameters = all the w's and b's. Start them RANDOM (network is dumb). | |
| # W1: weights from 2 inputs into 2 hidden neurons -> shape (2 inputs, 2 neurons) | |
| # b1: one bias per hidden neuron -> shape (1, 2) | |
| # W2: weights from 2 hidden neurons into 1 output -> shape (2, 1) | |
| # b2: one bias for the output neuron -> shape (1, 1) | |
| # Count: 4 + 2 + 2 + 1 = 9 parameters, exactly as in the diagram. | |
| # --------------------------------------------------------------------------- | |
# Weights start as random draws. W1 is drawn before W2 so the values
# produced from a given seed stay the same.
W1 = np.random.randn(2, 2)  # inputs -> hidden neurons A and B
W2 = np.random.randn(2, 1)  # hidden neurons A and B -> output neuron C

# Biases start at zero: one per hidden neuron, one for the output neuron.
b1, b2 = np.zeros((1, 2)), np.zeros((1, 1))

# Hyperparameters.
lr = 1.0        # learning rate: how big a nudge per step
epochs = 20000  # how many times we loop over the data
| # --------------------------------------------------------------------------- | |
| # 4) The training loop. Each pass: forward (predict) -> measure error -> | |
| # backward (find which way to nudge each parameter) -> update. | |
| # --------------------------------------------------------------------------- | |
for epoch in range(epochs):
    # ----- FORWARD PASS: all rows of X go through both layers at once -----
    hidden_in = X @ W1 + b1               # weighted sums into hidden neurons A and B
    hidden_out = sigmoid(hidden_in)       # outputs of A and B
    output_in = hidden_out @ W2 + b2      # weighted sum into output neuron C
    prediction = sigmoid(output_in)       # final prediction, one number per example

    # ----- LOSS: mean squared error between prediction and target -----
    error = prediction - y                # how wrong the output is, per example
    loss = np.mean(error ** 2)

    # ----- BACKWARD PASS: chain rule, layer by layer (backpropagation) -----
    # The raw error is used as the starting gradient; the constant factor
    # that differentiating the mean-squared loss would add is left out, so
    # it is effectively folded into the learning rate.
    grad_output_in = error * sigmoid_deriv(prediction)            # through C's activation
    grad_W2 = hidden_out.T @ grad_output_in                       # output weights
    grad_b2 = grad_output_in.sum(axis=0, keepdims=True)           # output bias
    grad_hidden_out = grad_output_in @ W2.T                       # error sent back to A and B
    grad_hidden_in = grad_hidden_out * sigmoid_deriv(hidden_out)  # through A's and B's activations
    grad_W1 = X.T @ grad_hidden_in                                # hidden weights
    grad_b1 = grad_hidden_in.sum(axis=0, keepdims=True)           # hidden biases

    # ----- UPDATE: step every parameter against its gradient, in place -----
    # All gradients above were computed from the pre-update weights.
    W2 -= lr * grad_W2
    b2 -= lr * grad_b2
    W1 -= lr * grad_W1
    b1 -= lr * grad_b1

    # ----- progress report: only every 2000th epoch -----
    if epoch % 2000 != 0:
        continue
    print(f"epoch {epoch:5d} loss = {loss:.5f}")
| # --------------------------------------------------------------------------- | |
| # 5) Results: the discovered parameters and a truth-table check. | |
| # --------------------------------------------------------------------------- | |
# Report each neuron's learned weights and bias, rounded to 2 decimals.
print("\n--- Discovered parameters (the 9 numbers training found) ---")
neurons = (
    ("Hidden neuron A", W1[:, 0], b1[0, 0]),
    ("Hidden neuron B", W1[:, 1], b1[0, 1]),
    ("Output neuron C", W2[:, 0], b2[0, 0]),
)
for label, weights, bias in neurons:
    print(f"{label}: w =", np.round(weights, 2), " b =", round(float(bias), 2))

# Run one more forward pass with the final parameters and compare every
# prediction with its target.
print("\n--- Truth table check ---")
z1 = X @ W1 + b1
a1 = sigmoid(z1)
a2 = sigmoid(a1 @ W2 + b2)
# Walk the rows directly instead of indexing a hard-coded range(4), so the
# check covers however many examples X actually holds.
for inputs, predicted, target in zip(X, a2, y):
    raw = float(predicted[0])
    print(f" input {inputs.astype(int)} -> {raw:.3f} -> rounded {round(raw)} (want {int(target[0])})")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment