...]"
)
train_csv = sys.argv[1]
if sys.argv[2].endswith('.csv'):
test_csv = sys.argv[2]
cur_input = 3
else:
test_csv = None
cur_input = 2
num_clusters = int(sys.argv[cur_input])
y_name = sys.argv[cur_input+1]
X_names = sys.argv[cur_input+2:]
y, continuous, categorical, all_cat_names = preprocessCSV(train_csv, y_name, X_names)
# The continuous columns come first, then the categorical
# Transpose so that the first index accesses the row
X = np.array(continuous + categorical).T
## Mostly LLM generated code below
# Initialize and train the MoE model
model = MixtureOfExperts(
k=num_clusters,
n_gauss_features=len(continuous),
max_iters=50,
random_state=42
)
model.fit(X)
print(f"\n--- Final Model Parameters (K={num_clusters}) ---")
print(f"Gating Priors (pi): {model.pi}")
print("\nGaussian Expert Means (mu):")
for k in range(model.k):
print(f" Expert {k}: {model.gauss_mu[k]}")
print("\nCategorical Expert Log-Probabilities:")
# Log-probs are stored as log(P(Category | Expert))
for j in range(model.n_cat):
print(f" Feature {len(continuous) + j} (Category Levels: {model.cat_max_levels[j]}):")
for k in range(model.k):
# Print exponentiated values for human readability (actual probabilities)
print(f" Expert {k} Probabilities: {np.exp(model.cat_log_probs[j][k])}")
print(f" Category names: {all_cat_names[j]}")
# Predict hard cluster assignments for the data points
assignments = model.predict(X)
cluster_common.printClusterStatistics(y, assignments)
if test_csv is not None:
test_y, test_continuous, test_categorical, test_all_cat_names = preprocessCSV(test_csv, y_name, X_names)
X_test = np.array(test_continuous + test_categorical).T
test_assignments = model.predict(X_test)
cluster_common.printClusterStatistics(test_y, test_assignments)
# Get the log likelihood of the test data
gamma, log_likelihood = model._e_step(X_test)
print(f"Test NLL {-log_likelihood}")
if __name__ == "__main__":
main()
```
---
## Statistics
```python
import numpy
# Let's get the statistics
def _format_tab_table(header, rows):
    """Return tab-separated table lines aligned on 8-character tab stops.

    Arguments:
        header (list[str]): Cells of the header row.
        rows (list[list[str]]): Cells of each body row, same length as header.

    Returns:
        list[str]: One formatted line per row, header first.
    """
    table = [header] + rows
    n_cols = len(header)
    # Each column ends at the first tab stop past its widest cell, so every
    # row needs a different number of tabs to land on the same stop.
    col_stops = [
        max(len(row[col]) for row in table) // 8 + 1
        for col in range(n_cols)
    ]
    lines = []
    for row in table:
        pieces = []
        for col, cell in enumerate(row):
            pieces.append(cell)
            # No padding after the last column.
            if col < n_cols - 1:
                pieces.append("\t" * (col_stops[col] - len(cell) // 8))
        lines.append("".join(pieces))
    return lines


def printClusterStatistics(y, clusters):
    """
    Print accuracy and confusion matrices for a clustering against known labels.

    Every cluster is labelled with the majority class of the rows assigned to
    it (ties go to the first class in sorted order), and those labels are
    treated as predictions. Works for any number of classes and for cluster
    ids that are not contiguous.

    Arguments:
        y (list): List of class labels
        clusters (list[int]): List of assigned clusters, one per entry of y.
            May be a plain list or a numpy array.

    Raises:
        ValueError: If y is empty or y and clusters differ in length.
    """
    labels = numpy.asarray(y)
    # Convert up front: comparing a plain list with == would not be elementwise
    assignments = numpy.asarray(clusters)
    total = len(labels)
    if total == 0:
        raise ValueError("y must contain at least one label")
    if labels.shape != assignments.shape:
        raise ValueError("y and clusters must have the same length")

    classes = numpy.unique(labels)

    # Assign class labels to the clusters based upon the majority of the class in each cluster.
    # Iterate over the ids that actually occur so empty or skipped ids are harmless.
    predicted = numpy.empty(labels.shape, dtype=labels.dtype)
    for cluster_id in numpy.unique(assignments):
        members = assignments == cluster_id
        cluster_classes, cluster_counts = numpy.unique(labels[members], return_counts=True)
        # Mark everything in the cluster with the majority class
        predicted[members] = cluster_classes[numpy.argmax(cluster_counts)]

    correct = int(numpy.count_nonzero(labels == predicted))
    accuracy = correct / total
    print(f"Accuracy is {accuracy}")

    # The rows of the confusion matrix are reality, the columns are the prediction
    confusion = [
        [
            int(numpy.count_nonzero((labels == real_class) & (predicted == predicted_class)))
            for predicted_class in classes
        ]
        for real_class in classes
    ]

    header = [str(total)] + [str(name) for name in classes]

    # Print the confusion matrix
    print("Raw confusion")
    raw_rows = [
        [str(name)] + [str(count) for count in counts]
        for name, counts in zip(classes, confusion)
    ]
    for line in _format_tab_table(header, raw_rows):
        print(line)

    print("Percentage confusion")
    fraction_rows = [
        [str(name)] + [f"{count/total:.3f}" for count in counts]
        for name, counts in zip(classes, confusion)
    ]
    for line in _format_tab_table(header, fraction_rows):
        print(line)

    print("The diagonal over the sum of each row is the recall")
    print("The diagonal over the sum of each column is the precision")
```
---
## Parameters
* Going to use all of the continuous columns, plus the island
* Let's also try with and without sex
* Going to begin with 3 clusters
---
## Output
Gaussian Expert Means (mu):
Expert 0: [ 38.02292033 17.77439908 187.15744389 3446.58789809]
Expert 1: [ 47.56806336 14.9966457 217.23528564 5092.43770281]
Expert 2: [ 45.26693581 18.85898489 195.82302063 3934.40100864]
Categorical Expert Log-Probabilities:
Feature 4 (Category Levels: 3):
Expert 0 Probabilities: [0.31865682 0.3915926 0.28975057]
Expert 1 Probabilities: [9.99998000e-01 1.00005506e-06 1.00039193e-06]
Expert 2 Probabilities: [0.1129877 0.72483876 0.16217354]
Category names: ['Biscoe', 'Dream', 'Torgersen']
---
## Prediction Results
Accuracy is 0.8258258258258259
Raw confusion
333 Adelie Chinstrap Gentoo
Adelie 93 53 0
Chinstrap 5 63 0
Gentoo 0 0 119
Percentage confusion
333 Adelie Chinstrap Gentoo
Adelie 0.279 0.159 0.000
Chinstrap 0.015 0.189 0.000
Gentoo 0.000 0.000 0.357
---
## Performance
* The actual Adelie island proportions (Biscoe, Dream, Torgersen) are 30.1%, 37.7%, and 32.1% — similar to cluster 0.
* All Gentoo are from Biscoe. Cluster 1 matches.
* All Chinstrap are from Dream. Cluster 2 is close.
* Better than k-means (72%) but we can do better
* Maybe we've made a bad assumption
* Let's add in the sex column
---
## New results
Gating Priors (pi): [0.3287228 0.3573577 0.3139195]
Gaussian Expert Means (mu):
Expert 0: [ 40.10380632 17.57828757 188.68964113 3414.26111317]
Expert 1: [ 47.56805878 14.99664074 217.23527266 5092.43587451]
Expert 2: [ 43.99517973 19.20019548 195.30382982 4029.34630019]
Categorical Expert Log-Probabilities:
Feature 4 (Category Levels: 3):
Expert 0 Probabilities: [0.20989062 0.56509639 0.225013 ]
Expert 1 Probabilities: [9.99998000e-01 1.00015477e-06 1.00018852e-06]
Expert 2 Probabilities: [0.20112228 0.58489147 0.21398625]
Category names: ['Biscoe', 'Dream', 'Torgersen']
Feature 5 (Category Levels: 2):
Expert 0 Probabilities: [0.94314441 0.05685559]
Expert 1 Probabilities: [0.48739524 0.51260476]
Expert 2 Probabilities: [0.03595822 0.96404178]
Category names: ['female', 'male']
Accuracy is 0.7957957957957958
---
## Statistics
Raw confusion
333 Adelie Chinstrap Gentoo
Adelie 146 0 0
Chinstrap 68 0 0
Gentoo 0 0 119
Percentage confusion
333 Adelie Chinstrap Gentoo
Adelie 0.438 0.000 0.000
Chinstrap 0.204 0.000 0.000
Gentoo 0.000 0.000 0.357
---
## Discussion
* Even worse!
* But look at the cluster probabilities for sex
* Only one cluster is an even mix
* Of the other two, one is almost entirely female (94%) and the other almost entirely male (96%)
---
## Cluster Count
* Perhaps the number of clusters is wrong?
* Intuitively, the males and females of a species could have large differences
* If we are starving for experts, some of them will make bad estimates
* We could say that this model is under parameterized, and is underfitting
---
## Increasing Clusters
---
## 6 Clusters, all categories
Categorical Expert Log-Probabilities:
Feature 4 (Category Levels: 3):
Expert 0 Probabilities: [0.29060101 0.37712167 0.33227733]
Expert 1 Probabilities: [1.03474350e-06 9.99997860e-01 1.10506899e-06]
Expert 2 Probabilities: [0.21412767 0.31222466 0.47364767]
Expert 3 Probabilities: [9.99997999e-01 1.00002301e-06 1.00100198e-06]
Expert 4 Probabilities: [0.4195082 0.45302936 0.12746244]
Expert 5 Probabilities: [1.00012925e-06 9.99997999e-01 1.00069388e-06]
Category names: ['Biscoe', 'Dream', 'Torgersen']
Feature 5 (Category Levels: 2):
Expert 0 Probabilities: [9.99998715e-01 1.28523896e-06]
Expert 1 Probabilities: [0.95618676 0.04381324]
Expert 2 Probabilities: [2.07100284e-06 9.99997929e-01]
Expert 3 Probabilities: [0.48739527 0.51260473]
Expert 4 Probabilities: [0.04048937 0.95951063]
Expert 5 Probabilities: [1.65357340e-06 9.99998346e-01]
Category names: ['female', 'male']
Accuracy is 0.993993993993994
---
## Statistics
Raw confusion
333 Adelie Chinstrap Gentoo
Adelie 145 1 0
Chinstrap 1 67 0
Gentoo 0 0 119
Percentage confusion
333 Adelie Chinstrap Gentoo
Adelie 0.435 0.003 0.000
Chinstrap 0.003 0.201 0.000
Gentoo 0.000 0.000 0.357
---
## Real Progress
* This is far better than k-means, which tops out at 72% accuracy from 3-20 clusters
* Even without any categories we get nearly 99% accuracy
* This is because we look at covariance now
* Adding in both categories does the best, with only 2 incorrect
* But the model performs poorly until we have enough clusters to fit the data
---
## Choosing Cluster Numbers
* Here we could look at the probabilities and see something was wrong
* The final result had clusters that were male or female
* With too few clusters the populations were clearly underfit
* Without labels though, how would we know when to stop adding clusters?
* We should look at the final NLL
---
## Rookie Mistake
* You need a testing set, or the training NLL will keep going down as you add clusters
* Even with unsupervised learning, it's still important to keep a holdout set
* We can re-use the randomized penguin train and test set from an earlier class
---
## Final NLL
---
## Test Predictions
Accuracy is 1.0
Raw confusion
33 Adelie Chinstrap Gentoo
Adelie 16 0 0
Chinstrap 0 6 0
Gentoo 0 0 11
Percentage confusion
33 Adelie Chinstrap Gentoo
Adelie 0.485 0.000 0.000
Chinstrap 0.000 0.182 0.000
Gentoo 0.000 0.000 0.333
The diagonal over the sum of each row is the recall
The diagonal over the sum of each column is the precision
Test NLL 414.0106456498475
---
## Final Thoughts
* If columns are correlated, then there are diminishing returns with more features
* That implies that we should be able to reduce columns into "meta columns", distilling features to components
* Next time!