K-Means Clustering (MSSC)¶
Principles learned¶
Create a set decision variable
Use the operator “count” to get the number of elements in a set
Use a lambda expression to compute a sum on a set
Use ternary conditions
Problem¶
Given a sample of observations along some dimensions, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.
Download the exampleData¶
The format of the data is as follows:
1st line: number of observations and number of dimensions
For each observation: coordinate along each dimension, and the actual cluster it belongs to
Program¶
The model implemented here makes use of set variables. For every cluster, we define a set which describes the observations assigned to that cluster. Those sets are constrained to form a partition, which means that an observation must be assigned to exactly one cluster. For each cluster, we compute the centroid of the observations in the cluster, from which we can obtain the variance of the cluster. The variance of a cluster is defined as the sum of the respective squared euclidian distances between the centroid and every element of the cluster. The objective is to minimize the sum of these variances.
- Execution:
- hexaly kmeans.hxm inFileName=instances/iris.dat [hxTimeLimit=] [solFileName=] [k=val]
use io;
/* Read instance data */
function input() {
usage = "Usage: hexaly kmeans.hxm inFileName=inputFile "
+ "[solFileName=outputFile] [hxTimeLimit=timeLimit] [k=value]";
if (inFileName == nil) throw usage;
local f = io.openRead(inFileName);
nbObservations = f.readInt();
nbDimensions = f.readInt();
if (k == nil)
k = 2;
for [o in 0...nbObservations] {
coordinates[o][d in 0...nbDimensions] = f.readDouble();
f.readString(); // skip initial clusters
}
}
/* Declare the optimization model */
function model() {
// Set decisions: clusters[c] represents the points in cluster c
clusters[c in 0...k] <- set(nbObservations);
// Each point must be in one cluster and one cluster only
constraint partition[c in 0...k](clusters[c]);
// Compute variances
for [c in 0...k] {
local cluster <- clusters[c];
local size <- count(cluster);
// Compute the centroid of the cluster
centroid[d in 0...nbDimensions] <- size == 0 ? 0 :
sum(cluster, o => coordinates[o][d]) / size;
// Compute the variance of the cluster
squares[d in 0...nbDimensions] <- sum(cluster,
o => pow(coordinates[o][d] - centroid[d], 2));
variances[c] <- sum[d in 0...nbDimensions](squares[d]);
}
// Minimize the total variance
obj <- sum[c in 0...k](variances[c]);
minimize obj;
}
/* Parametrize the solver */
function param() {
if (hxTimeLimit == nil) hxTimeLimit = 5;
}
/* Write the solution in a file in the following format:
* - objective value
* - k
* - for each cluster, a line with the elements in the cluster (separated by spaces) */
function output() {
if (solFileName == nil) return;
local solFile = io.openWrite(solFileName);
solFile.println(obj.value);
solFile.println(k);
for [c in 0...k] {
for [o in clusters[c].value]
solFile.print(o + " ");
solFile.println();
}
}
- Execution (Windows)
- set PYTHONPATH=%HX_HOME%\bin\pythonpython kmeans.py instances\iris.dat
- Execution (Linux)
- export PYTHONPATH=/opt/hexaly_13_0/bin/pythonpython kmeans.py instances/iris.dat
import hexaly.optimizer
import sys
def read_elem(filename):
with open(filename) as f:
return [str(elem) for elem in f.read().split()]
#
# Read instance data
#
def read_instance(filename):
file_it = iter(read_elem(filename))
# Data properties
nb_observations = int(next(file_it))
nb_dimensions = int(next(file_it))
coordinates_data = [None] * nb_observations
for o in range(nb_observations):
coordinates_data[o] = [None] * (nb_dimensions)
for d in range(nb_dimensions):
coordinates_data[o][d] = float(next(file_it))
next(file_it) # skip initial clusters
return nb_observations, nb_dimensions, coordinates_data
def main(instance_file, output_file, time_limit, k):
nb_observations, nb_dimensions, coordinates_data = read_instance(instance_file)
with hexaly.optimizer.HexalyOptimizer() as optimizer:
#
# Declare the optimization model
#
model = optimizer.model
# clusters[c] represents the points in cluster c
clusters = [model.set(nb_observations) for c in range(k)]
# Each point must be in one cluster and one cluster only
model.constraint(model.partition(clusters))
# Coordinates of points
coordinates = model.array(coordinates_data)
# Compute variances
variances = []
for cluster in clusters:
size = model.count(cluster)
# Compute centroid of cluster
centroid = [0 for d in range(nb_dimensions)]
for d in range(nb_dimensions):
coordinate_lambda = model.lambda_function(
lambda i: model.at(coordinates, i, d))
centroid[d] = model.iif(
size == 0,
0,
model.sum(cluster, coordinate_lambda) / size)
# Compute variance of cluster
variance = model.sum()
for d in range(nb_dimensions):
dimension_variance_lambda = model.lambda_function(lambda i: model.sum(
model.pow(model.at(coordinates, i, d) - centroid[d], 2)))
dimension_variance = model.sum(cluster, dimension_variance_lambda)
variance.add_operand(dimension_variance)
variances.append(variance)
# Minimize the total variance
obj = model.sum(variances)
model.minimize(obj)
model.close()
# Parameterize the optimizer
optimizer.param.time_limit = time_limit
optimizer.solve()
#
# Write the solution in a file in the following format:
# - objective value
# - k
# - for each cluster, a line with the elements in the cluster
# (separated by spaces)
#
if output_file != None:
with open(output_file, 'w') as f:
f.write("%f\n" % obj.value)
f.write("%d\n" % k)
for c in range(k):
for o in clusters[c].value:
f.write("%d " % o)
f.write("\n")
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
sys.exit(1)
instance_file = sys.argv[1]
output_file = sys.argv[2] if len(sys.argv) >= 3 else None
time_limit = int(sys.argv[3]) if len(sys.argv) >= 4 else 60
k = int(sys.argv[4]) if len(sys.argv) >= 5 else 2
main(instance_file, output_file, time_limit, k)
- Compilation / Execution (Windows)
- cl /EHsc kmeans.cpp -I%HX_HOME%\include /link %HX_HOME%\bin\hexaly130.libkmeans instances\iris.dat
- Compilation / Execution (Linux)
- g++ kmeans.cpp -I/opt/hexaly_13_0/include -lhexaly130 -lpthread -o kmeans./kmeans instances/iris.dat
#include "optimizer/hexalyoptimizer.h"
#include <fstream>
#include <iostream>
#include <limits>
#include <vector>
using namespace hexaly;
using namespace std;
class Kmeans {
public:
// Data properties
int nbObservations;
int nbDimensions;
int k;
vector<vector<double>> coordinatesData;
// Hexaly Optimizer
HexalyOptimizer optimizer;
// Decisions
vector<HxExpression> clusters;
// Objective
HxExpression obj;
Kmeans(int k) : k(k) {}
// Read instance data
void readInstance(const string& fileName) {
ifstream infile;
infile.exceptions(ifstream::failbit | ifstream::badbit);
infile.open(fileName.c_str());
infile >> nbObservations;
infile >> nbDimensions;
coordinatesData.resize(nbObservations);
string tmp;
for (int o = 0; o < nbObservations; ++o) {
coordinatesData[o].resize(nbDimensions);
for (int d = 0; d < nbDimensions; ++d) {
infile >> coordinatesData[o][d];
}
infile >> tmp; // skip initial clusters
}
}
void solve(int limit) {
// Declare the optimization model
HxModel model = optimizer.getModel();
// Set decisions: clusters[c] represents the points in cluster c
clusters.resize(k);
for (int c = 0; c < k; ++c) {
clusters[c] = model.setVar(nbObservations);
}
// Each point must be in one cluster and one cluster only
model.constraint(model.partition(clusters.begin(), clusters.end()));
// Coordinates of points
HxExpression coordinates = model.array();
for (int o = 0; o < nbObservations; ++o) {
coordinates.addOperand(model.array(coordinatesData[o].begin(), coordinatesData[o].end()));
}
// Compute variances
vector<HxExpression> variances;
variances.resize(k);
for (int c = 0; c < k; ++c) {
HxExpression cluster = clusters[c];
HxExpression size = model.count(cluster);
// Compute the centroid of the cluster
HxExpression centroid = model.array();
for (int d = 0; d < nbDimensions; ++d) {
HxExpression coordinateLambda =
model.createLambdaFunction([&](HxExpression o) { return model.at(coordinates, o, d); });
centroid.addOperand(model.iif(size == 0, 0, model.sum(cluster, coordinateLambda) / size));
}
// Compute the variance of the cluster
HxExpression variance = model.sum();
for (int d = 0; d < nbDimensions; ++d) {
HxExpression dimensionVarianceLambda = model.createLambdaFunction(
[&](HxExpression o) { return model.pow(model.at(coordinates, o, d) - model.at(centroid, d), 2); });
HxExpression dimensionVariance = model.sum(cluster, dimensionVarianceLambda);
variance.addOperand(dimensionVariance);
}
variances[c] = variance;
}
// Minimize the total variance
obj = model.sum(variances.begin(), variances.end());
model.minimize(obj);
model.close();
// Parametrize the optimizer
optimizer.getParam().setTimeLimit(limit);
optimizer.solve();
}
/* Write the solution in a file in the following format:
* - objective value
* - k
* - for each cluster, a line with the elements in the cluster (separated by spaces) */
void writeSolution(const string& fileName) {
ofstream outfile;
outfile.exceptions(ofstream::failbit | ofstream::badbit);
outfile.open(fileName.c_str());
outfile << obj.getDoubleValue() << endl;
outfile << k << endl;
for (int c = 0; c < k; ++c) {
HxCollection clusterCollection = clusters[c].getCollectionValue();
for (int i = 0; i < clusterCollection.count(); ++i) {
outfile << clusterCollection[i] << " ";
}
outfile << endl;
}
}
};
int main(int argc, char** argv) {
if (argc < 2) {
cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
return 1;
}
const char* instanceFile = argv[1];
const char* solFile = argc > 2 ? argv[2] : NULL;
const char* strTimeLimit = argc > 3 ? argv[3] : "5";
const char* k = argc > 4 ? argv[4] : "2";
try {
Kmeans model(atoi(k));
model.readInstance(instanceFile);
model.solve(atoi(strTimeLimit));
if (solFile != NULL)
model.writeSolution(solFile);
return 0;
} catch (const exception& e) {
cerr << "An error occurred: " << e.what() << endl;
return 1;
}
}
- Compilation / Execution (Windows)
- copy %HX_HOME%\bin\Hexaly.NET.dll .csc Kmeans.cs /reference:Hexaly.NET.dllKmeans instances\iris.dat
using System;
using System.IO;
using System.Globalization;
using Hexaly.Optimizer;
public class Kmeans : IDisposable
{
// Data properties
int nbObservations;
int nbDimensions;
int k;
double[][] coordinatesData;
// Hexaly Optimizer
HexalyOptimizer optimizer;
// Decisions
HxExpression[] clusters;
// Objective
HxExpression obj;
public Kmeans(int k)
{
optimizer = new HexalyOptimizer();
this.k = k;
}
// Read instance data
public void ReadInstance(string fileName)
{
using (StreamReader input = new StreamReader(fileName))
{
string[] splittedLine = input.ReadLine().Split();
nbObservations = int.Parse(splittedLine[0]);
nbDimensions = int.Parse(splittedLine[1]);
coordinatesData = new double[nbObservations][];
for (int o = 0; o < nbObservations; ++o)
{
splittedLine = input.ReadLine().Split();
coordinatesData[o] = new double[nbDimensions];
for (int d = 0; d < nbDimensions; ++d)
coordinatesData[o][d] = double.Parse(
splittedLine[d],
CultureInfo.InvariantCulture
);
}
}
}
public void Dispose()
{
if (optimizer != null)
optimizer.Dispose();
}
public void Solve(int limit)
{
// Declare the optimization model
HxModel model = optimizer.GetModel();
// Set decisions: clusters[c] represents the points in cluster c
clusters = new HxExpression[k];
for (int c = 0; c < k; ++c)
clusters[c] = model.Set(nbObservations);
// Each point must be in one cluster and one cluster only
model.Constraint(model.Partition(clusters));
// Coordinates of points
HxExpression coordinates = model.Array(coordinatesData);
// Compute variances
HxExpression[] variances = new HxExpression[k];
for (int c = 0; c < k; ++c)
{
HxExpression cluster = clusters[c];
HxExpression size = model.Count(cluster);
// Compute the centroid of the cluster
HxExpression centroid = model.Array();
for (int d = 0; d < nbDimensions; ++d)
{
HxExpression coordinateLambda = model.LambdaFunction(
o => model.At(coordinates, o, model.CreateConstant(d))
);
centroid.AddOperand(
model.If(size == 0, 0, model.Sum(cluster, coordinateLambda) / size)
);
}
// Compute the variance of the cluster
HxExpression variance = model.Sum();
for (int d = 0; d < nbDimensions; ++d)
{
HxExpression dimensionVarianceLambda = model.LambdaFunction(
o =>
model.Pow(
model.At(coordinates, o, model.CreateConstant(d))
- model.At(centroid, model.CreateConstant(d)),
2
)
);
HxExpression dimensionVariance = model.Sum(cluster, dimensionVarianceLambda);
variance.AddOperand(dimensionVariance);
}
variances[c] = variance;
}
// Minimize the total variance
obj = model.Sum(variances);
model.Minimize(obj);
model.Close();
// Parametrize the optimizer
optimizer.GetParam().SetTimeLimit(limit);
optimizer.Solve();
}
/* Write the solution in a file in the following format:
* - objective value
* - k
* - for each cluster, a line with the elements in the cluster (separated by spaces) */
public void WriteSolution(string fileName)
{
using (StreamWriter output = new StreamWriter(fileName))
{
output.WriteLine(obj.GetDoubleValue());
output.WriteLine(k);
for (int c = 0; c < k; ++c)
{
HxCollection clusterCollection = clusters[c].GetCollectionValue();
for (int i = 0; i < clusterCollection.Count(); ++i)
output.Write(clusterCollection[i] + " ");
output.WriteLine();
}
output.Close();
}
}
public static void Main(string[] args)
{
if (args.Length < 1)
{
Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
Environment.Exit(1);
}
string instanceFile = args[0];
string outputFile = args.Length > 1 ? args[1] : null;
string strTimeLimit = args.Length > 2 ? args[2] : "5";
string k = args.Length > 3 ? args[3] : "2";
using (Kmeans model = new Kmeans(int.Parse(k)))
{
model.ReadInstance(instanceFile);
model.Solve(int.Parse(strTimeLimit));
if (outputFile != null)
model.WriteSolution(outputFile);
}
}
}
- Compilation / Execution (Windows)
- javac Kmeans.java -cp %HX_HOME%\bin\hexaly.jarjava -cp %HX_HOME%\bin\hexaly.jar;. Kmeans instances\iris.dat
- Compilation / Execution (Linux)
- javac Kmeans.java -cp /opt/hexaly_13_0/bin/hexaly.jarjava -cp /opt/hexaly_13_0/bin/hexaly.jar:. Kmeans instances/iris.dat
import java.util.*;
import java.io.*;
import com.hexaly.optimizer.*;
public class Kmeans {
// Data properties
private int nbObservations;
private int nbDimensions;
private int k;
private double[][] coordinatesData;
// Hexaly Optimizer
private final HexalyOptimizer optimizer;
// Decisions
private HxExpression[] clusters;
// Objective
private HxExpression obj;
private Kmeans(HexalyOptimizer optimizer) {
this.optimizer = optimizer;
}
// Read instance data
private void readInstance(int k, String fileName) throws IOException {
try (Scanner input = new Scanner(new File(fileName))) {
input.useLocale(Locale.ROOT);
nbObservations = input.nextInt();
nbDimensions = input.nextInt();
this.k = k;
coordinatesData = new double[nbObservations][nbDimensions];
for (int o = 0; o < nbObservations; ++o) {
for (int d = 0; d < nbDimensions; ++d) {
coordinatesData[o][d] = input.nextDouble();
}
input.next(); // skip initial clusters
}
}
}
private void solve(int limit) {
// Declare the optimization model
HxModel model = optimizer.getModel();
// Set decisions: clusters[c] represents the points in cluster c
clusters = new HxExpression[k];
for (int c = 0; c < k; ++c) {
clusters[c] = model.setVar(nbObservations);
}
// Each point must be in one cluster and one cluster only
model.constraint(model.partition(clusters));
// Coordinates of points
HxExpression coordinates = model.array(coordinatesData);
// Compute variances
HxExpression[] variances = new HxExpression[k];
for (int c = 0; c < k; ++c) {
HxExpression cluster = clusters[c];
HxExpression size = model.count(cluster);
// Compute the centroid of the cluster
HxExpression centroid = model.array();
for (int d = 0; d < nbDimensions; ++d) {
HxExpression dExpr = model.createConstant(d);
HxExpression coordinateLambda = model.lambdaFunction(o -> model.at(coordinates, o, dExpr));
centroid
.addOperand(model.iif(model.eq(size, 0), 0, model.div(model.sum(cluster, coordinateLambda), size)));
}
// Compute the variance of the cluster
HxExpression variance = model.sum();
for (int d = 0; d < nbDimensions; ++d) {
HxExpression dExpr = model.createConstant(d);
HxExpression dimensionVarianceLambda = model.lambdaFunction(
o -> model.pow(model.sub(model.at(coordinates, o, dExpr), model.at(centroid, dExpr)), 2));
HxExpression dimensionVariance = model.sum(cluster, dimensionVarianceLambda);
variance.addOperand(dimensionVariance);
}
variances[c] = variance;
}
// Minimize the total variance
obj = model.sum(variances);
model.minimize(obj);
model.close();
// Parametrize the optimizer
optimizer.getParam().setTimeLimit(limit);
optimizer.solve();
}
/*
* Write the solution in a file in the following format:
* - objective value
* - k
* - for each cluster, a line with the elements in the cluster (separated by
* spaces)
*/
private void writeSolution(String fileName) throws IOException {
try (PrintWriter output = new PrintWriter(fileName)) {
output.println(obj.getDoubleValue());
output.println(k);
for (int c = 0; c < k; ++c) {
HxCollection clusterCollection = clusters[c].getCollectionValue();
for (int i = 0; i < clusterCollection.count(); ++i) {
output.print(clusterCollection.get(i) + " ");
}
output.println();
}
}
}
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
System.exit(1);
}
String instanceFile = args[0];
String outputFile = args.length > 1 ? args[1] : null;
String strTimeLimit = args.length > 2 ? args[2] : "5";
String k = args.length > 3 ? args[3] : "2";
try (HexalyOptimizer optimizer = new HexalyOptimizer()) {
Kmeans model = new Kmeans(optimizer);
model.readInstance(Integer.parseInt(k), instanceFile);
model.solve(Integer.parseInt(strTimeLimit));
if (outputFile != null) {
model.writeSolution(outputFile);
}
} catch (Exception ex) {
System.err.println(ex);
ex.printStackTrace();
System.exit(1);
}
}
}