K-Means Clustering Problem (MSSC)

Problem

In the K-Means Clustering Problem, or Minimum Sum-of-Squares Clustering (MSSC), we want to partition a set of multidimensional observations into k clusters, defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.

Principles learned

Data

The format of the data is as follows:

  • 1st line: number of observations and number of dimensions for each observation
  • For each observation: coordinate along each dimension, and the cluster it belongs to in the optimal solution

Program

The Hexaly model for the K-Means Clustering Problem (MSSC) uses set decision variables. Each set variable represents a cluster, and the elements inside it represent the observations belonging to this cluster. We use a ‘partition’ operator to ensure that each observation belongs to exactly one cluster.

We compute the centroid of each cluster using a lambda function to apply the ‘sum’ operator over the coordinates of all the observations in this cluster, for all dimensions. Note that the number of terms in this sum varies during the search, along with the size of the set.

We can then compute the total variance. A cluster’s variance is the sum of squared Euclidian distances between the centroid and every observation in the cluster. As for the centroids, we compute each cluster’s variance using a lambda function. The objective is to minimize the sum of these variances.

Execution
hexaly kmeans.hxm inFileName=instances/iris.dat [hxTimeLimit=] [solFileName=] [k=val]
use io;

/* Read instance data */
function input() {
    usage = "Usage: hexaly kmeans.hxm inFileName=inputFile "
            + "[solFileName=outputFile] [hxTimeLimit=timeLimit] [k=value]";

    if (inFileName == nil) throw usage;
    local f = io.openRead(inFileName);
    nbObservations = f.readInt();
    nbDimensions = f.readInt();

    if (k == nil)
        k = 2;

    for [o in 0...nbObservations] {
        coordinates[o][d in 0...nbDimensions] = f.readDouble();
        f.readString(); // skip initial clusters
    }
}

/* Declare the optimization model */
function model() {
    // Set decisions: clusters[c] represents the points in cluster c
    clusters[c in 0...k] <- set(nbObservations);

    // Each point must be in one cluster and one cluster only
    constraint partition[c in 0...k](clusters[c]);

    // Compute variances
    for [c in 0...k] {
        local cluster <- clusters[c];
        local size <- count(cluster);

        // Compute the centroid of the cluster
        centroid[d in 0...nbDimensions] <- size == 0 ? 0 :
                sum(cluster, o => coordinates[o][d]) / size;

        // Compute the variance of the cluster
        squares[d in 0...nbDimensions] <- sum(cluster,
                o => pow(coordinates[o][d] - centroid[d], 2));
        variances[c] <- sum[d in 0...nbDimensions](squares[d]);
    }

    // Minimize the total variance
    obj <- sum[c in 0...k](variances[c]);
    minimize obj;
}

/* Parametrize the solver */
function param() {
    if (hxTimeLimit == nil) hxTimeLimit = 5;
}

/* Write the solution in a file in the following format:
 *  - objective value
 *  - k
 *  - for each cluster, a line with the elements in the cluster (separated by spaces) */
function output() {
    if (solFileName == nil) return;
    local solFile = io.openWrite(solFileName);
    solFile.println(obj.value);
    solFile.println(k);
    for [c in 0...k] {
        for [o in clusters[c].value]
            solFile.print(o + " ");
        solFile.println();
    }
}
Execution (Windows)
set PYTHONPATH=%HX_HOME%\bin\python
python kmeans.py instances\iris.dat
 
Execution (Linux)
export PYTHONPATH=/opt/hexaly_13_0/bin/python
python kmeans.py instances/iris.dat
import hexaly.optimizer
import sys

def read_elem(filename):
    with open(filename) as f:
        return [str(elem) for elem in f.read().split()]

#
# Read instance data
#
def read_instance(filename):
    file_it = iter(read_elem(filename))

    # Data properties
    nb_observations = int(next(file_it))
    nb_dimensions = int(next(file_it))

    coordinates_data = [None] * nb_observations
    for o in range(nb_observations):
        coordinates_data[o] = [None] * (nb_dimensions)
        for d in range(nb_dimensions):
            coordinates_data[o][d] = float(next(file_it))
        next(file_it) # skip initial clusters

    return nb_observations, nb_dimensions, coordinates_data

def main(instance_file, output_file, time_limit, k):
    nb_observations, nb_dimensions, coordinates_data = read_instance(instance_file)

    with hexaly.optimizer.HexalyOptimizer() as optimizer:
        #
        # Declare the optimization model
        #
        model = optimizer.model

        # clusters[c] represents the points in cluster c
        clusters = [model.set(nb_observations) for c in range(k)]

        # Each point must be in one cluster and one cluster only
        model.constraint(model.partition(clusters))

        # Coordinates of points
        coordinates = model.array(coordinates_data)

        # Compute variances
        variances = []
        for cluster in clusters:
            size = model.count(cluster)

            # Compute centroid of cluster
            centroid = [0 for d in range(nb_dimensions)]
            for d in range(nb_dimensions):
                coordinate_lambda = model.lambda_function(
                    lambda i: model.at(coordinates, i, d))
                centroid[d] = model.iif(
                    size == 0,
                    0,
                    model.sum(cluster, coordinate_lambda) / size)

            # Compute variance of cluster
            variance = model.sum()
            for d in range(nb_dimensions):
                dimension_variance_lambda = model.lambda_function(lambda i: model.sum(
                    model.pow(model.at(coordinates, i, d) - centroid[d], 2)))
                dimension_variance = model.sum(cluster, dimension_variance_lambda)
                variance.add_operand(dimension_variance)
            variances.append(variance)

        # Minimize the total variance
        obj = model.sum(variances)
        model.minimize(obj)

        model.close()

        # Parameterize the optimizer
        optimizer.param.time_limit = time_limit

        optimizer.solve()

        #
        # Write the solution in a file in the following format:
        #  - objective value
        #  - k
        #  - for each cluster, a line with the elements in the cluster
        #    (separated by spaces)
        #
        if output_file != None:
            with open(output_file, 'w') as f:
                f.write("%f\n" % obj.value)
                f.write("%d\n" % k)
                for c in range(k):
                    for o in clusters[c].value:
                        f.write("%d " % o)
                    f.write("\n")

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
        sys.exit(1)

    instance_file = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) >= 3 else None
    time_limit = int(sys.argv[3]) if len(sys.argv) >= 4 else 60
    k = int(sys.argv[4]) if len(sys.argv) >= 5 else 2
    main(instance_file, output_file, time_limit, k)
Compilation / Execution (Windows)
cl /EHsc kmeans.cpp -I%HX_HOME%\include /link %HX_HOME%\bin\hexaly130.lib
kmeans instances\iris.dat
 
 
Compilation / Execution (Linux)
g++ kmeans.cpp -I/opt/hexaly_13_0/include -lhexaly130 -lpthread -o kmeans
./kmeans instances/iris.dat
#include "optimizer/hexalyoptimizer.h"
#include <fstream>
#include <iostream>
#include <limits>
#include <vector>

using namespace hexaly;
using namespace std;

class Kmeans {
public:
    // Data properties
    int nbObservations;
    int nbDimensions;
    int k;

    vector<vector<double>> coordinatesData;

    // Hexaly Optimizer
    HexalyOptimizer optimizer;

    // Decisions
    vector<HxExpression> clusters;

    // Objective
    HxExpression obj;

    Kmeans(int k) : k(k) {}

    // Read instance data
    void readInstance(const string& fileName) {
        ifstream infile;
        infile.exceptions(ifstream::failbit | ifstream::badbit);
        infile.open(fileName.c_str());

        infile >> nbObservations;
        infile >> nbDimensions;

        coordinatesData.resize(nbObservations);
        string tmp;
        for (int o = 0; o < nbObservations; ++o) {
            coordinatesData[o].resize(nbDimensions);
            for (int d = 0; d < nbDimensions; ++d) {
                infile >> coordinatesData[o][d];
            }
            infile >> tmp; // skip initial clusters
        }
    }

    void solve(int limit) {
        // Declare the optimization model
        HxModel model = optimizer.getModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters.resize(k);
        for (int c = 0; c < k; ++c) {
            clusters[c] = model.setVar(nbObservations);
        }

        // Each point must be in one cluster and one cluster only
        model.constraint(model.partition(clusters.begin(), clusters.end()));

        // Coordinates of points
        HxExpression coordinates = model.array();
        for (int o = 0; o < nbObservations; ++o) {
            coordinates.addOperand(model.array(coordinatesData[o].begin(), coordinatesData[o].end()));
        }

        // Compute variances
        vector<HxExpression> variances;
        variances.resize(k);
        for (int c = 0; c < k; ++c) {
            HxExpression cluster = clusters[c];
            HxExpression size = model.count(cluster);

            // Compute the centroid of the cluster
            HxExpression centroid = model.array();
            for (int d = 0; d < nbDimensions; ++d) {
                HxExpression coordinateLambda =
                    model.createLambdaFunction([&](HxExpression o) { return model.at(coordinates, o, d); });
                centroid.addOperand(model.iif(size == 0, 0, model.sum(cluster, coordinateLambda) / size));
            }

            // Compute the variance of the cluster
            HxExpression variance = model.sum();
            for (int d = 0; d < nbDimensions; ++d) {
                HxExpression dimensionVarianceLambda = model.createLambdaFunction(
                    [&](HxExpression o) { return model.pow(model.at(coordinates, o, d) - model.at(centroid, d), 2); });
                HxExpression dimensionVariance = model.sum(cluster, dimensionVarianceLambda);
                variance.addOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.sum(variances.begin(), variances.end());
        model.minimize(obj);

        model.close();

        // Parametrize the optimizer
        optimizer.getParam().setTimeLimit(limit);

        optimizer.solve();
    }

    /* Write the solution in a file in the following format:
     *  - objective value
     *  - k
     *  - for each cluster, a line with the elements in the cluster (separated by spaces) */
    void writeSolution(const string& fileName) {
        ofstream outfile;
        outfile.exceptions(ofstream::failbit | ofstream::badbit);
        outfile.open(fileName.c_str());

        outfile << obj.getDoubleValue() << endl;
        outfile << k << endl;
        for (int c = 0; c < k; ++c) {
            HxCollection clusterCollection = clusters[c].getCollectionValue();
            for (int i = 0; i < clusterCollection.count(); ++i) {
                outfile << clusterCollection[i] << " ";
            }
            outfile << endl;
        }
    }
};

int main(int argc, char** argv) {
    if (argc < 2) {
        cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
        return 1;
    }

    const char* instanceFile = argv[1];
    const char* solFile = argc > 2 ? argv[2] : NULL;
    const char* strTimeLimit = argc > 3 ? argv[3] : "5";
    const char* k = argc > 4 ? argv[4] : "2";

    try {
        Kmeans model(atoi(k));
        model.readInstance(instanceFile);
        model.solve(atoi(strTimeLimit));
        if (solFile != NULL)
            model.writeSolution(solFile);
        return 0;
    } catch (const exception& e) {
        cerr << "An error occurred: " << e.what() << endl;
        return 1;
    }
}
Compilation / Execution (Windows)
copy %HX_HOME%\bin\Hexaly.NET.dll .
csc Kmeans.cs /reference:Hexaly.NET.dll
Kmeans instances\iris.dat
using System;
using System.IO;
using System.Globalization;
using Hexaly.Optimizer;

public class Kmeans : IDisposable
{
    // Data properties
    int nbObservations;
    int nbDimensions;
    int k;

    double[][] coordinatesData;

    // Hexaly Optimizer
    HexalyOptimizer optimizer;

    // Decisions
    HxExpression[] clusters;

    // Objective
    HxExpression obj;

    public Kmeans(int k)
    {
        optimizer = new HexalyOptimizer();
        this.k = k;
    }

    // Read instance data
    public void ReadInstance(string fileName)
    {
        using (StreamReader input = new StreamReader(fileName))
        {
            string[] splittedLine = input.ReadLine().Split();

            nbObservations = int.Parse(splittedLine[0]);
            nbDimensions = int.Parse(splittedLine[1]);

            coordinatesData = new double[nbObservations][];
            for (int o = 0; o < nbObservations; ++o)
            {
                splittedLine = input.ReadLine().Split();
                coordinatesData[o] = new double[nbDimensions];
                for (int d = 0; d < nbDimensions; ++d)
                    coordinatesData[o][d] = double.Parse(
                        splittedLine[d],
                        CultureInfo.InvariantCulture
                    );
            }
        }
    }

    public void Dispose()
    {
        if (optimizer != null)
            optimizer.Dispose();
    }

    public void Solve(int limit)
    {
        // Declare the optimization model
        HxModel model = optimizer.GetModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters = new HxExpression[k];
        for (int c = 0; c < k; ++c)
            clusters[c] = model.Set(nbObservations);

        // Each point must be in one cluster and one cluster only
        model.Constraint(model.Partition(clusters));

        // Coordinates of points
        HxExpression coordinates = model.Array(coordinatesData);

        // Compute variances
        HxExpression[] variances = new HxExpression[k];
        for (int c = 0; c < k; ++c)
        {
            HxExpression cluster = clusters[c];
            HxExpression size = model.Count(cluster);

            // Compute the centroid of the cluster
            HxExpression centroid = model.Array();
            for (int d = 0; d < nbDimensions; ++d)
            {
                HxExpression coordinateLambda = model.LambdaFunction(
                    o => model.At(coordinates, o, model.CreateConstant(d))
                );
                centroid.AddOperand(
                    model.If(size == 0, 0, model.Sum(cluster, coordinateLambda) / size)
                );
            }

            // Compute the variance of the cluster
            HxExpression variance = model.Sum();
            for (int d = 0; d < nbDimensions; ++d)
            {
                HxExpression dimensionVarianceLambda = model.LambdaFunction(
                    o =>
                        model.Pow(
                            model.At(coordinates, o, model.CreateConstant(d))
                                - model.At(centroid, model.CreateConstant(d)),
                            2
                        )
                );
                HxExpression dimensionVariance = model.Sum(cluster, dimensionVarianceLambda);
                variance.AddOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.Sum(variances);
        model.Minimize(obj);

        model.Close();

        // Parametrize the optimizer
        optimizer.GetParam().SetTimeLimit(limit);

        optimizer.Solve();
    }

    /* Write the solution in a file in the following format:
     *  - objective value
     *  - k
     *  - for each cluster, a line with the elements in the cluster (separated by spaces) */
    public void WriteSolution(string fileName)
    {
        using (StreamWriter output = new StreamWriter(fileName))
        {
            output.WriteLine(obj.GetDoubleValue());
            output.WriteLine(k);
            for (int c = 0; c < k; ++c)
            {
                HxCollection clusterCollection = clusters[c].GetCollectionValue();
                for (int i = 0; i < clusterCollection.Count(); ++i)
                    output.Write(clusterCollection[i] + " ");
                output.WriteLine();
            }
            output.Close();
        }
    }

    public static void Main(string[] args)
    {
        if (args.Length < 1)
        {
            Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
            Environment.Exit(1);
        }

        string instanceFile = args[0];
        string outputFile = args.Length > 1 ? args[1] : null;
        string strTimeLimit = args.Length > 2 ? args[2] : "5";
        string k = args.Length > 3 ? args[3] : "2";
        using (Kmeans model = new Kmeans(int.Parse(k)))
        {
            model.ReadInstance(instanceFile);
            model.Solve(int.Parse(strTimeLimit));
            if (outputFile != null)
                model.WriteSolution(outputFile);
        }
    }
}
Compilation / Execution (Windows)
javac Kmeans.java -cp %HX_HOME%\bin\hexaly.jar
java -cp %HX_HOME%\bin\hexaly.jar;. Kmeans instances\iris.dat
Compilation / Execution (Linux)
javac Kmeans.java -cp /opt/hexaly_13_0/bin/hexaly.jar
java -cp /opt/hexaly_13_0/bin/hexaly.jar:. Kmeans instances/iris.dat
import java.util.*;
import java.io.*;
import com.hexaly.optimizer.*;

public class Kmeans {
    // Data properties
    private int nbObservations;
    private int nbDimensions;
    private int k;

    private double[][] coordinatesData;

    // Hexaly Optimizer
    private final HexalyOptimizer optimizer;

    // Decisions
    private HxExpression[] clusters;

    // Objective
    private HxExpression obj;

    private Kmeans(HexalyOptimizer optimizer) {
        this.optimizer = optimizer;
    }

    // Read instance data
    private void readInstance(int k, String fileName) throws IOException {
        try (Scanner input = new Scanner(new File(fileName))) {
            input.useLocale(Locale.ROOT);

            nbObservations = input.nextInt();
            nbDimensions = input.nextInt();

            this.k = k;
            coordinatesData = new double[nbObservations][nbDimensions];
            for (int o = 0; o < nbObservations; ++o) {
                for (int d = 0; d < nbDimensions; ++d) {
                    coordinatesData[o][d] = input.nextDouble();
                }
                input.next(); // skip initial clusters
            }
        }
    }

    private void solve(int limit) {
        // Declare the optimization model
        HxModel model = optimizer.getModel();

        // Set decisions: clusters[c] represents the points in cluster c
        clusters = new HxExpression[k];
        for (int c = 0; c < k; ++c) {
            clusters[c] = model.setVar(nbObservations);
        }

        // Each point must be in one cluster and one cluster only
        model.constraint(model.partition(clusters));

        // Coordinates of points
        HxExpression coordinates = model.array(coordinatesData);

        // Compute variances
        HxExpression[] variances = new HxExpression[k];
        for (int c = 0; c < k; ++c) {
            HxExpression cluster = clusters[c];
            HxExpression size = model.count(cluster);

            // Compute the centroid of the cluster
            HxExpression centroid = model.array();
            for (int d = 0; d < nbDimensions; ++d) {
                HxExpression dExpr = model.createConstant(d);
                HxExpression coordinateLambda = model.lambdaFunction(o -> model.at(coordinates, o, dExpr));
                centroid
                    .addOperand(model.iif(model.eq(size, 0), 0, model.div(model.sum(cluster, coordinateLambda), size)));
            }

            // Compute the variance of the cluster
            HxExpression variance = model.sum();
            for (int d = 0; d < nbDimensions; ++d) {
                HxExpression dExpr = model.createConstant(d);
                HxExpression dimensionVarianceLambda = model.lambdaFunction(
                    o -> model.pow(model.sub(model.at(coordinates, o, dExpr), model.at(centroid, dExpr)), 2));
                HxExpression dimensionVariance = model.sum(cluster, dimensionVarianceLambda);
                variance.addOperand(dimensionVariance);
            }
            variances[c] = variance;
        }

        // Minimize the total variance
        obj = model.sum(variances);
        model.minimize(obj);

        model.close();

        // Parametrize the optimizer
        optimizer.getParam().setTimeLimit(limit);

        optimizer.solve();
    }

    /*
     * Write the solution in a file in the following format:
     * - objective value
     * - k
     * - for each cluster, a line with the elements in the cluster (separated by
     * spaces)
     */
    private void writeSolution(String fileName) throws IOException {
        try (PrintWriter output = new PrintWriter(fileName)) {
            output.println(obj.getDoubleValue());
            output.println(k);
            for (int c = 0; c < k; ++c) {
                HxCollection clusterCollection = clusters[c].getCollectionValue();
                for (int i = 0; i < clusterCollection.count(); ++i) {
                    output.print(clusterCollection.get(i) + " ");
                }
                output.println();
            }
        }
    }

    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
            System.exit(1);
        }

        String instanceFile = args[0];
        String outputFile = args.length > 1 ? args[1] : null;
        String strTimeLimit = args.length > 2 ? args[2] : "5";
        String k = args.length > 3 ? args[3] : "2";

        try (HexalyOptimizer optimizer = new HexalyOptimizer()) {
            Kmeans model = new Kmeans(optimizer);
            model.readInstance(Integer.parseInt(k), instanceFile);
            model.solve(Integer.parseInt(strTimeLimit));
            if (outputFile != null) {
                model.writeSolution(outputFile);
            }
        } catch (Exception ex) {
            System.err.println(ex);
            ex.printStackTrace();
            System.exit(1);
        }
    }
}

Results

Hexaly Optimizer reaches gaps below 1% for the K-Means Clustering Problem (MSSC) in 1 minute of running time, on the instances from the UCI Machine Learning Repository and TSPLIB research benchmarks, with more than 10,000 observations. Our K-Means Clustering Problem (MSSC) benchmark page illustrates how Hexaly Optimizer outperforms traditional general-purpose optimization solvers like Gurobi on this challenging problem.