#!/usr/bin/env python3
"""
Neural Network that learns from sentences (sentiment).
- Has a small labeled dataset of sentences: positive (1) vs negative (0).
- Builds a simple word-level vocabulary.
- Converts each sentence to a bag-of-words vector.
- Trains a 2-layer neural network:
input (vocab_size) -> hidden (8, ReLU) -> output (1, sigmoid)
- Prints training loss and tests on a few new sentences.
"""
import numpy as np
import re
from typing import List, Dict, Tuple
# -----------------------
# Sentiment dataset
# -----------------------
def sentiment_dataset() -> Tuple[List[str], np.ndarray]:
"""
Returns:
sentences: list of sentences (str)
labels: numpy array of shape (N, 1), 1 = positive, 0 = negative
"""
positive_sentences = [
"I love this movie",
"This is a great product",
"What a fantastic experience",
"I am very happy today",
"The food was amazing",
"I really enjoyed this",
"This is absolutely wonderful",
]
negative_sentences = [
"I hate this movie",
"This is a terrible product",
"What a bad experience",
"I am very sad today",
"The food was awful",
"I really disliked this",
"This is absolutely horrible",
]
sentences = positive_sentences + negative_sentences
labels = np.array(
[1] * len(positive_sentences) + [0] * len(negative_sentences),
dtype=np.float32
).reshape(-1, 1)
return sentences, labels
# -----------------------
# Text preprocessing
# -----------------------
def tokenize(text: str) -> List[str]:
"""
Very simple tokenizer:
- Lowercase
- Remove non-letter characters except spaces
- Split on whitespace
"""
text = text.lower()
text = re.sub(r"[^a-z\s]", "", text) # keep letters and spaces
tokens = text.split()
return tokens
def build_vocab(sentences: List[str], min_freq: int = 1) -> Dict[str, int]:
"""
Builds a word -> index vocabulary from the list of sentences.
min_freq: minimum word frequency to keep in the vocab
"""
freq = {}
for s in sentences:
for tok in tokenize(s):
freq[tok] = freq.get(tok, 0) + 1
# Reserve index 0 for unknown tokens (UNK)
word2idx = {"": 0}
idx = 1
for word, count in sorted(freq.items()):
if count >= min_freq:
word2idx[word] = idx
idx += 1
return word2idx
def sentence_to_bow(sentence: str, word2idx: Dict[str, int]) -> np.ndarray:
"""
Convert a sentence into a bag-of-words vector of shape (vocab_size,).
- Bag-of-words: counts how many times each word appears (order is ignored).
"""
tokens = tokenize(sentence)
vec = np.zeros(len(word2idx), dtype=np.float32)
for tok in tokens:
        idx = word2idx.get(tok, 0)  # 0 is the UNK index for out-of-vocabulary words
vec[idx] += 1.0
return vec
def vectorize_sentences(sentences: List[str], word2idx: Dict[str, int]) -> np.ndarray:
"""
Convert a list of sentences into a matrix X of shape (N, vocab_size).
"""
X = np.stack([sentence_to_bow(s, word2idx) for s in sentences], axis=0)
return X
# -----------------------
# Neural Network
# -----------------------
class SimpleNeuralNet:
"""
Small fully-connected neural network:
input_dim -> hidden_dim (ReLU) -> 1 (sigmoid)
Implemented from scratch with NumPy for educational purposes.
"""
def __init__(self, input_dim: int, hidden_dim: int = 8, seed: int = 0):
rng = np.random.default_rng(seed)
        # He initialization: std = sqrt(2 / fan_in), a good default for ReLU layers
self.W1 = rng.normal(0.0, 1.0, size=(input_dim, hidden_dim)) * np.sqrt(2 / input_dim)
self.b1 = np.zeros((1, hidden_dim), dtype=np.float32)
self.W2 = rng.normal(0.0, 1.0, size=(hidden_dim, 1)) * np.sqrt(2 / hidden_dim)
self.b2 = np.zeros((1, 1), dtype=np.float32)
@staticmethod
def _relu(z):
return np.maximum(0, z)
@staticmethod
def _relu_grad(z):
return (z > 0).astype(z.dtype)
@staticmethod
def _sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
def forward(self, X: np.ndarray):
"""
Forward pass. Returns:
- y_hat: predicted probabilities, shape (N, 1)
- cache: intermediate values for backprop
"""
z1 = X @ self.W1 + self.b1 # (N, hidden_dim)
a1 = self._relu(z1)
z2 = a1 @ self.W2 + self.b2 # (N, 1)
y_hat = self._sigmoid(z2)
cache = {"X": X, "z1": z1, "a1": a1, "z2": z2, "y_hat": y_hat}
return y_hat, cache
@staticmethod
def binary_cross_entropy(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-8) -> float:
"""
Binary cross-entropy loss.
y_true: shape (N, 1), values 0 or 1
y_pred: shape (N, 1), values in (0, 1)
"""
y_pred = np.clip(y_pred, eps, 1 - eps)
loss = -np.mean(
y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
)
return float(loss)
def backward(self, cache, y_true: np.ndarray):
"""
Backpropagation to compute gradients.
Returns a dict: dW1, db1, dW2, db2
"""
X = cache["X"]
z1 = cache["z1"]
a1 = cache["a1"]
y_hat = cache["y_hat"]
N = X.shape[0]
# dL/dz2 for sigmoid + BCE
dz2 = (y_hat - y_true) / N # (N, 1)
dW2 = a1.T @ dz2 # (hidden_dim, 1)
db2 = np.sum(dz2, axis=0, keepdims=True)
da1 = dz2 @ self.W2.T # (N, hidden_dim)
dz1 = da1 * self._relu_grad(z1)
dW1 = X.T @ dz1 # (input_dim, hidden_dim)
db1 = np.sum(dz1, axis=0, keepdims=True)
return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
def update_params(self, grads, lr: float = 0.1):
"""
Gradient descent parameter update.
"""
self.W1 -= lr * grads["dW1"]
self.b1 -= lr * grads["db1"]
self.W2 -= lr * grads["dW2"]
self.b2 -= lr * grads["db2"]
def predict_proba(self, X: np.ndarray) -> np.ndarray:
"""
Predict probabilities for X.
"""
y_hat, _ = self.forward(X)
return y_hat
def predict(self, X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
"""
Predict binary labels (0 or 1) for X.
"""
probs = self.predict_proba(X)
return (probs >= threshold).astype(int)
# -----------------------
# Training demo
# -----------------------
def train_text_nn():
# 1) Build dataset
sentences, labels = sentiment_dataset()
word2idx = build_vocab(sentences, min_freq=1)
X = vectorize_sentences(sentences, word2idx) # (N, vocab_size)
y = labels # (N, 1)
print(f"Number of sentences: {len(sentences)}")
print(f"Vocabulary size: {len(word2idx)}")
print(f"Example vocab items: {list(word2idx.items())[:10]}")
print()
# 2) Initialize neural network
input_dim = X.shape[1]
nn = SimpleNeuralNet(input_dim=input_dim, hidden_dim=8, seed=42)
# 3) Train
epochs = 1500
lr = 0.1
for epoch in range(1, epochs + 1):
y_hat, cache = nn.forward(X)
loss = nn.binary_cross_entropy(y, y_hat)
grads = nn.backward(cache, y)
nn.update_params(grads, lr)
if epoch % 150 == 0 or epoch == 1:
print(f"Epoch {epoch:4d}/{epochs}, loss = {loss:.4f}")
# 4) Evaluate on training set
y_pred = nn.predict(X)
acc = np.mean(y_pred == y)
print(f"\nTraining accuracy: {acc * 100:.2f}%")
# 5) Try some new sentences
test_sentences = [
"I really love this",
"This food is bad",
"What a wonderful experience",
"I am unhappy today",
"This is terrible",
"This is great",
]
X_test = vectorize_sentences(test_sentences, word2idx)
probs = nn.predict_proba(X_test)
preds = (probs >= 0.5).astype(int)
print("\nTest sentences:")
for s, p, pr in zip(test_sentences, preds, probs):
label = "positive" if p[0] == 1 else "negative"
print(f" '{s}' -> {label} (prob={pr[0]:.3f})")
if __name__ == "__main__":
train_text_nn()
#--------------------------------------------------------------------------------#
# Number of sentences: 14
# Vocabulary size: 31
#
# Example vocab items:
# [
#     ('', 0), ('a', 1), ('absolutely', 2), ('am', 3), ('amazing', 4),
#     ('awful', 5), ('bad', 6), ('disliked', 7), ('enjoyed', 8), ('experience', 9)
# ]
#
# Epoch 1/1500, loss = 0.6875
# Epoch 150/1500, loss = 0.2697
# Epoch 300/1500, loss = 0.0904
# Epoch 450/1500, loss = 0.0378
# Epoch 600/1500, loss = 0.0211
# Epoch 750/1500, loss = 0.0139
# Epoch 900/1500, loss = 0.0101
# Epoch 1050/1500, loss = 0.0078
# Epoch 1200/1500, loss = 0.0063
# Epoch 1350/1500, loss = 0.0052
# Epoch 1500/1500, loss = 0.0044
#
# Training accuracy: 100.00%
#
# Test sentences:
# 'I really love this' -> positive (prob=0.997)
# 'This food is bad' -> negative (prob=0.002)
# 'What a wonderful experience' -> positive (prob=0.990)
# 'I am unhappy today' -> positive (prob=0.685)
# 'This is terrible' -> negative (prob=0.001)
# 'This is great' -> positive (prob=0.965)
#--------------------------------------------------------------------------------#
An embedding layer is a lookup table that maps integer token IDs to dense vectors of a fixed dimension (typically 50–300 for small models). Internally it's a (vocab_size × embedding_dim) weight matrix; given token ID 42 it returns row 42. The weights are learned via backprop just like any other layer, so tokens that play similar roles in the loss end up with similar vectors. Embeddings can be initialized from pretrained vectors (GloVe, word2vec) and either frozen or fine-tuned; starting from useful structure instead of random noise helps a lot when labeled data is scarce.
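A minimal NumPy sketch of the lookup idea, in the same from-scratch spirit as the script above (the sizes and token IDs are illustrative, not taken from the code):

import numpy as np

rng = np.random.default_rng(0)
vocab_size, embed_dim = 32, 16                   # illustrative sizes

# The embedding "layer" is just this matrix; row i is the vector for token ID i.
embedding_matrix = rng.normal(0.0, 0.1, size=(vocab_size, embed_dim))

token_ids = np.array([4, 7, 7, 12])              # a tokenized sentence as integer IDs
sentence_vectors = embedding_matrix[token_ids]   # shape (4, embed_dim): one row per token

# During training, gradients flow only into the rows that were looked up,
# so rows for tokens that never appear in the data barely move.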
Sentences have variable length but tensor batches need fixed shape, so shorter sequences are padded to the max length in the batch (or a global max) with a reserved PAD token (usually ID 0). The catch: the model must ignore PAD tokens or it learns spurious patterns from them. In PyTorch you use pack_padded_sequence for RNNs or a key-padding mask for Transformer layers (Hugging Face models take an attention_mask); in Keras you set mask_zero=True on the Embedding layer. The loss function should also mask PAD positions so they don't contribute to the gradient.
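A small NumPy sketch of padding plus a mask, again from scratch (the example sequences and sizes are made up for illustration):

import numpy as np

# Toy batch of ID sequences with different lengths; ID 0 is reserved for PAD.
sequences = [[4, 7, 12], [9, 3], [5, 8, 2, 6]]
max_len = max(len(s) for s in sequences)

padded = np.zeros((len(sequences), max_len), dtype=np.int64)   # (batch, max_len)
for i, seq in enumerate(sequences):
    padded[i, :len(seq)] = seq

mask = (padded != 0).astype(np.float32)                        # 1 for real tokens, 0 for PAD

# One way to use the mask: mean-pool embeddings over real tokens only,
# so PAD positions contribute nothing to the sentence vector.
embedding_matrix = np.random.default_rng(0).normal(size=(32, 16))
embedded = embedding_matrix[padded]                            # (batch, max_len, embed_dim)
pooled = (embedded * mask[:, :, None]).sum(axis=1) / mask.sum(axis=1, keepdims=True)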
Tokenize text and convert to padded ID sequences. For each batch: run the embedded tokens through the model (CNN/LSTM/Transformer) to get sequence representations, pool them into a single vector (mean, max, or CLS token), and pass that through a final dense + sigmoid for binary classification. Compute BCE loss against the gold label, backprop, and step the optimizer (Adam with lr ~1e-3 when training from scratch, ~2e-5 when fine-tuning a pretrained model). Track validation accuracy and loss each epoch; stop when validation loss plateaus or starts climbing (early stopping). Class imbalance? Weight the loss or oversample the minority class.
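A condensed PyTorch sketch of that loop, using mean pooling over an Embedding layer; the model name, sizes, and the toy batch standing in for a real DataLoader are all placeholders, not part of the script above:

import torch
import torch.nn as nn

class MeanPoolClassifier(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int = 64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, ids, mask):
        emb = self.embed(ids)                                  # (B, T, D)
        pooled = (emb * mask.unsqueeze(-1)).sum(1) / mask.sum(1, keepdim=True)
        return self.fc(pooled).squeeze(-1)                     # raw logits, shape (B,)

# A stand-in for a real DataLoader: one toy batch of padded IDs, masks, labels.
ids = torch.tensor([[4, 7, 12, 0], [9, 3, 0, 0]])
mask = (ids != 0).float()
labels = torch.tensor([1.0, 0.0])
train_batches = [(ids, mask, labels)]

model = MeanPoolClassifier(vocab_size=10_000)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()            # sigmoid + BCE in one numerically stable op

for epoch in range(10):
    model.train()
    for batch_ids, batch_mask, batch_labels in train_batches:
        optimizer.zero_grad()
        loss = loss_fn(model(batch_ids, batch_mask), batch_labels)
        loss.backward()
        optimizer.step()
    # evaluate on a held-out set here and stop early when val loss stops improving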
CNNs (1D convolutions over embeddings) are the fastest and surprisingly effective for short-text sentiment because they capture local n-gram patterns ("not good", "really bad") with shared filters. LSTMs handle longer-range dependencies better and were the standard before Transformers, but they're sequential (slow to train, hard to parallelize). Transformers (or pretrained encoders like BERT/DistilBERT/RoBERTa) win on accuracy almost everywhere now — they handle long range, parallelize on GPU, and benefit from massive pretraining. In 2026 the practical choice for sentiment is "fine-tune DistilBERT" unless you have latency constraints that force a smaller model.
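To make the local n-gram point concrete, a minimal PyTorch sketch of a width-3 convolution over embeddings (all sizes illustrative):

import torch
import torch.nn as nn

embed = nn.Embedding(10_000, 64, padding_idx=0)
conv = nn.Conv1d(in_channels=64, out_channels=100, kernel_size=3, padding=1)

ids = torch.randint(1, 10_000, (8, 20))      # batch of 8 sequences, length 20
x = embed(ids).transpose(1, 2)               # Conv1d expects (batch, channels, length)
features = torch.relu(conv(x))               # each filter responds to 3-token windows
sentence_vec = features.max(dim=2).values    # max-pool over positions -> (8, 100)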
Negation flips polarity ("not good" should be negative) but bag-of-words and small CNNs treat "not" and "good" as independent positive signals. LSTMs and Transformers handle it better because they see the context, but they still struggle when negation is far from the modified word ("the food, despite what every other reviewer says, was not good"). Sarcasm is harder because the surface vocabulary is positive while the intent is negative — you need world knowledge or a large pretrained model with diverse exposure. The honest answer for production: collect a domain-specific labeled set of these hard cases and fine-tune; no architecture solves it for free.
When a model that looked fine in training falls apart in production, the cause is almost always distribution shift. Check: is the production text from the same domain (movie reviews vs product reviews vs tweets)? Different vocabulary, different conventions. Is preprocessing identical (lowercasing, tokenizer, max length)? A subtle mismatch silently breaks things. Is class balance the same? A model trained on 50/50 fails when production is 90/10. The diagnostic: dump 100 production failures, label them by hand, and bucket the failure modes; usually two or three buckets cover 80% of them, and each has a clear fix (more training data of that type, a preprocessing change, or a confidence-threshold adjustment).