This page is for an old version of Hexaly Optimizer. We recommend that you update your version and read the documentation for the latest stable release.

K-means¶

Principles learned¶

  • Define the actual set of decision variables
  • Define the float bounds from the data
  • Use of non-linearities: minimizing a sum of “min”

Problem¶

../_images/kmeans.png

Given a sample of n observations and a set of quantitative variables, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.

Download the example

Data¶

The format of the data is as follows:

  • 1st line: number of observations and number of variables
  • For each observation: coordinate along each variable, and the actual cluster it belongs to

Program¶

The only decision variables are x[c][v] to define the coordinate of the center of gravity of cluster c along variable v. There is no need for variables to determine which cluster an obervation belongs to: the distance with the closest cluster is computed thanks to the min operator. The objective is to minimize the sum of these distances.

Execution:
localsolver kmeans.lsp inFileName=instances/irit.dat [lsTimeLimit=] [solFileName=] [k=val]
/********** kmeans.lsp **********/
use io;

/* Reads instance data */
function input(){
    usage = "\nUsage: localsolver kmeans.lsp "
        + "inFileName=inputFile [solFileName=outputFile] [lsTimeLimit=timeLimit] [k=value]\n";

    if (inFileName == nil) throw usage;
    local f = io.openRead(inFileName);
    nbObservations = f.readInt();
    nbVariables = f.readInt();
    
    if (k == nil){
        k = 2;
    }
    
    for[o in 1..nbObservations]{
        for[v in 1..nbVariables]{
	        M[o][v] = f.readDouble();
	    }
        clusters[o] = f.readString();
    }
    
    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    for[v in 1..nbVariables]{
        mini[v] = min[o in 1..nbObservations](M[o][v]);
        maxi[v] = max[o in 1..nbObservations](M[o][v]);
    }
}

/* Declares the optimization model. */
function model(){
    // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
    x[1..k][v in 1..nbVariables] <- float(mini[v], maxi[v]);

    // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
    d[o in 1..nbObservations] <- min[c in 1..k](sum[v in 1..nbVariables](pow(x[c][v] - M[o][v], 2)));

    // Minimize the total distance
    obj <- sum[o in 1..nbObservations](d[o]);
    minimize obj;
}

/* Parameterizes the solver. */
function param(){
    if(lsTimeLimit == nil) lsTimeLimit=5;
}

/* Writes the solution in a file in the following format:
 *  - objective value
 *  - k
 *  - for each cluster, the coordinates along each variable
 */
function output(){
    if(solFileName == nil) return;
	local solFile = io.openWrite(solFileName);
    solFile.println(obj.value);
    solFile.println(k);
    for[c in 1..k] {
        for[v in 1..nbVariables] {
            solFile.print(x[c][v].value + " ");
        }
        solFile.println();
    }
}
Execution (Windows)
set PYTHONPATH=%LS_HOME%\bin\python27\
python kmeans.py instances\irit.dat
Execution (Linux)
export PYTHONPATH=/opt/localsolver_XXX/bin/python27/
python kmeans.py instances/irit.dat
########## kmeans.py ##########

import localsolver
import sys

if len(sys.argv) < 2:
    print ("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
    sys.exit(1)

def read_elem(filename):
    with open(filename) as f:
        return [str(elem) for elem in f.read().split()]

with localsolver.LocalSolver() as ls:

    #
    # Reads instance data
    #
    if len(sys.argv) > 4: k = int(sys.argv[4])
    else: k = 2

    file_it = iter(read_elem(sys.argv[1]))
    # Data properties 
    nbObservations = int(file_it.next())
    nbVariables = int(file_it.next())

    M = [None]*nbObservations
    qualitative = [None]*nbObservations
    for o in range(nbObservations):
        M[o] = [None]*(nbVariables)
        for v in range(nbVariables):
            M[o][v] = float(file_it.next())
        qualitative[o] = file_it.next()

    # mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    mini = [min(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]
    maxi = [max(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]

    #
    # Declares the optimization model
    #
    model = ls.model

    # x[o][v] are decision variables equal to the center of gravity of the kth group for the jit variable
    x = [[model.float(mini[v], maxi[v]) for v in range(nbVariables)] for c in range(k)]

    # d[o] is the distance between observation o and the nearest cluster (its center of gravity)
    d = [model.min(model.sum((x[c][v] - M[o][v])**2 for v in range(nbVariables)) for c in range(k))
        for o in range(nbObservations)]

    # Minimize the total distance
    obj = model.sum(d)
    model.minimize(obj)

    model.close()

    #
    # Parameterizes the solver
    #
    ls.param.nb_threads = 2
    if len(sys.argv) > 3: ls.create_phase().time_limit = int(sys.argv[3])
    else: ls.create_phase().time_limit = 5
  
    ls.solve()

    #
    # Writes the solution in a file in the following format:
    #  - objective value
    #  - k
    #  - for each cluster, the coordinates along each variable
    #
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as f:
            f.write("%f\n" % obj.value)
            f.write("%d\n" % k)
            for c in range(k):
                for v in range(nbVariables):
                    f.write("%f " % x[c][v].value)
                f.write("\n")
Compilation / Execution (Windows)
cl /EHsc kmeans.cpp -I%LS_HOME%\include /link %LS_HOME%\bin\localsolver.dll.lib
kmeans instances\irit.dat
Compilation / Execution (Linux)
g++ kmeans.cpp -I/opt/localsolver_XXX/include -llocalsolver -lpthread -o kmeans
./kmeans instances/irit.dat
/********** kmeans.cpp **********/

#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include "localsolver.h"

using namespace localsolver;
using namespace std;

class Kmeans {
public:
	/* Data properties */
	int nbObservations;
	int nbVariables;
    int k;

	vector<vector<lsdouble> > M;
	vector<string> clusters;

	// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
	vector<lsdouble> mini;
	vector<lsdouble> maxi;

    /* Solver. */
    LocalSolver localsolver;

    /* Decisions */
    vector<vector<LSExpression> > x;

    /* Objective */
    LSExpression obj;

    /* Constructor */
    Kmeans(int k) : k(k) {
        if (k < 1) {
            cerr << "Error: k=" << k << " it is supposed to be greater or equal than 1 " << endl;
            exit(1);
        }
    }

	/* Reads instance data */
	void readInstance(const string& fileName) {

        ifstream infile(fileName.c_str());
        if (!infile.is_open()) {
            cerr << "File " << fileName << " cannot be opened." << endl;
            exit(1);
        }
        infile >> nbObservations;
        infile >> nbVariables;

		M.resize(nbObservations);
		clusters.resize(nbObservations);
		mini.resize(nbVariables, numeric_limits<lsdouble>::infinity());
		maxi.resize(nbVariables, -numeric_limits<lsdouble>::infinity());
        for (int o = 0; o < nbObservations; o++) {
			M[o].resize(nbVariables);
			for (int v = 0; v < nbVariables; v++) {
	            infile >> M[o][v];
				if (M[o][v] < mini[v]) mini[v] = M[o][v];
				if (M[o][v] > maxi[v]) maxi[v] = M[o][v];
			}
			infile >> clusters[o];
        }
    }

	void solve(int limit) {
		try {
            /* Declares the optimization model. */
            LSModel model = localsolver.getModel();

            // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
			x.resize(k);
            for (int c = 0; c < k; c++) {
				x[c].resize(nbVariables);
				for (int v = 0; v < nbVariables; v++) {
	                x[c][v] = model.floatVar(mini[v], maxi[v]);
				}
            }

            // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
            vector<LSExpression> d(nbObservations);            
            for(int o = 0; o < nbObservations; o++) {
                d[o] = model.min();
                for(int c = 0; c < k; c++) {
                    LSExpression distanceCluster = model.sum();
                    for(int v = 0; v < nbVariables; v++) {
                        distanceCluster += model.pow(x[c][v] - M[o][v], 2);
                    }
                    d[o].addOperand(distanceCluster);
                }
            }

            // Minimize the total distance
            obj = model.sum(d.begin(), d.end());
            model.minimize(obj);

            model.close();

            /* Parameterizes the solver. */
            LSPhase phase = localsolver.createPhase();
            phase.setTimeLimit(limit);
            localsolver.solve(); 

        } catch (const LSException& e) {
            cerr << "LSException:" << e.getMessage() << endl;
            exit(1);
        }
    }

    /* Writes the solution in a file in the following format:
     *  - objective value
     *  - k
     *  - for each cluster, the coordinates along each variable
     */
    void writeSolution(const string& fileName) {
        ofstream outfile(fileName.c_str());
        if (!outfile.is_open()) {
            cerr << "File " << fileName << " cannot be opened." << endl;
            exit(1);
        }
		outfile << obj.getDoubleValue() << endl;
        outfile << k << endl;
        for(int c = 0; c < k; c++) {
            for(int v = 0; v < nbVariables; v++) {
                outfile << x[c][v].getDoubleValue() << " ";
            }
            outfile << endl;
        }
        outfile.close();
    }
};
    
int main(int argc, char** argv) {
    if (argc < 2) {
        cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
        exit(1);
    }

    const char* instanceFile = argv[1];
    const char* solFile = argc > 2 ? argv[2] : NULL;
    const char* strTimeLimit = argc > 3 ? argv[3] : "5";
    const char* k = argc > 4 ? argv[4] : "2";

    Kmeans model(atoi(k));
    model.readInstance(instanceFile);
    model.solve(atoi(strTimeLimit));

    if(solFile != NULL)
        model.writeSolution(solFile);
    return 0;
}
Compilation / Execution (Windows)
javac Kmeans.java -cp %LS_HOME%\bin\localsolver.jar
java -cp %LS_HOME%\bin\localsolver.jar;. Kmeans instances\irit.dat
Compilation/Execution (Linux)
javac Kmeans.java -cp /opt/localsolver_XXX/bin/localsolver.jar
java -cp /opt/localsolver_XXX/bin/localsolver.jar:. Kmeans instances/irit.dat
/********** Kmeans.java **********/

import java.util.*;
import java.io.*;
import localsolver.*;

public class Kmeans {
    /* Data properties */
    int nbObservations;
    int nbVariables;
    int k;

    double[][] M;
    String[] clusters;

    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    double[] mini;
    double[] maxi;

    /* Solver. */
    LocalSolver localsolver;

    /* Decisions. */
    LSExpression[][] x;

    /* Objective. */
    LSExpression obj;

    Kmeans(int k) {
        this.k = k;
    }

    /* Reads instance data */
    void readInstance(String fileName) {
        try {
            Scanner input = new Scanner(new File(fileName));
            input.useLocale(Locale.ROOT);

            nbObservations = input.nextInt();
            nbVariables = input.nextInt();

            M = new double[nbObservations][nbVariables];
            clusters = new String[nbObservations];
            mini = new double[nbVariables];
            maxi = new double[nbVariables];
            Arrays.fill(mini, Double.POSITIVE_INFINITY);
            Arrays.fill(maxi, Double.NEGATIVE_INFINITY);
            for (int o = 0; o < nbObservations; o++) {
                for (int v = 0; v < nbVariables; v++) {
                    M[o][v] = input.nextDouble();
                    if (M[o][v] < mini[v]) mini[v] = M[o][v];
                    else if (M[o][v] > maxi[v]) maxi[v] = M[o][v];
                }
                clusters[o] = input.next();
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    void solve(int limit) {
        try {
            localsolver = new LocalSolver();

            /* Declares the optimization model. */
            LSModel model = localsolver.getModel();

            // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
            x = new LSExpression[k][nbVariables];
            for (int c = 0; c < k; c++) {
                for (int v = 0; v < nbVariables; v++) {
                    x[c][v] = model.floatVar(mini[v] ,maxi[v]);
                }
            }

            // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
            LSExpression[] d = new LSExpression[nbObservations];
            for (int o = 0; o < nbObservations; o++) {
                d[o] = model.min();
                for (int c = 0; c < k; c++) {
                    LSExpression distanceCluster = model.sum();
                    for (int v = 0; v < nbVariables; v++) {
                        distanceCluster.addOperand(model.pow(model.sub(x[c][v], M[o][v]), 2));
                    }
                    d[o].addOperand(distanceCluster);
                }
            }

            // Minimize the total distance
            obj = model.sum(d);
            model.minimize(obj);

            model.close();

            /* Parameterizes the solver. */
            LSPhase phase = localsolver.createPhase();
            phase.setTimeLimit(limit);

            localsolver.solve();  

        } catch (LSException e) {
            System.out.println("LSException:" + e.getMessage());
            System.exit(1);
        }
    }

    /* Writes the solution in a file in the following format:
     *  - objective value
     *  - k
     *  - for each cluster, the coordinates along each variable
     */
    void writeSolution(String fileName) {
        try {
            BufferedWriter output = new BufferedWriter(new FileWriter(fileName));
            output.write(obj.getDoubleValue() + "\n");
            output.write(k + "\n");
            for (int c = 0; c < k; c++) {
                for (int v = 0; v < nbVariables; v++) {
                    output.write(x[c][v].getDoubleValue() + " ");
                }
                output.write("\n");
            }
            output.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }
    
    public static void main(String[] args) {
          if (args.length < 1) {
            System.out.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
            System.exit(1);
        }

        String instanceFile = args[0];
        String outputFile = args.length > 1 ? args[1] : null;
        String strTimeLimit = args.length > 2 ? args[2] : "5";
        String k = args.length > 3 ? args[3] : "2";

        Kmeans model = new Kmeans(Integer.parseInt(k));
        model.readInstance(instanceFile);
        model.solve(Integer.parseInt(strTimeLimit));
        if(outputFile != null) {
            model.writeSolution(outputFile);
        }
    }
}
Compilation/Execution (Windows)
copy %LS_HOME%\bin\*net.dll .
csc Kmeans.cs /reference:localsolvernet.dll
Kmeans instances\irit.dat
/********** kmeans.cs **********/

using System;
using System.IO;
using localsolver;

public class Kmeans : IDisposable
{
    /* Data properties */
    int nbObservations;
    int nbVariables;
    int k;

    double[,] M;
    string[] clusters;

    // mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
    double[] mini;
    double[] maxi;

    /* Solver. */
    LocalSolver localsolver;

    /* Decisions. */
    LSExpression[,] x;

    /* Objective. */
    LSExpression obj;

    public Kmeans(int k)
    {
        localsolver = new LocalSolver();
        this.k = k;
    }

    /* Reads instance data */
    public void ReadInstance (string fileName)
    {
        using (StreamReader input = new StreamReader(fileName))
        {       
            string[] splittedLine = input.ReadLine().Split();

            nbObservations = int.Parse(splittedLine[0]);
            nbVariables = int.Parse(splittedLine[1]);

            mini = new double[nbVariables];
            maxi = new double[nbVariables];
            for (int v = 0; v < nbVariables; v++)
            {
                mini[v] = double.PositiveInfinity;
                maxi[v] = double.NegativeInfinity;
            }
            M = new double[nbObservations,nbVariables];
            clusters = new string[nbObservations];
            for (int o = 0; o < nbObservations; o++) 
            {
                splittedLine = input.ReadLine().Split();
                for (int v = 0; v < nbVariables; v++)
                {
                    M[o,v] = double.Parse(splittedLine[v]);
                    if (M[o,v] < mini[v]) mini[v] = M[o,v];
                    if (M[o,v] > maxi[v]) maxi[v] = M[o,v];
                }
                clusters[o] = splittedLine[nbVariables];
            }
        }
    }

    public void Dispose ()
    {
        localsolver.Dispose();
    }

    public void Solve(int limit)
    {
        /* Declares the optimization model */
        LSModel model = localsolver.GetModel();

        // x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
        x = new LSExpression[k,nbVariables];
        for (int o = 0; o < k; o++) 
        {
            for (int v = 0; v < nbVariables; v++)
            {
                x[o,v] = model.Float(mini[v] ,maxi[v]);
            }
        }

        // d[o] is the distance between observation o and the nearest cluster (its center of gravity)
        LSExpression[] d = new LSExpression[nbObservations];
        for (int o = 0; o < nbObservations; o++)
        {
            d[o] = model.Min();
            for (int c = 0; c < k; c++)
            {
                LSExpression distanceCluster = model.Sum();
                for (int v = 0; v < nbVariables; v++)
                {
                    distanceCluster.AddOperand(model.Pow(x[c, v] - M[o, v], 2));
                }
                d[o].AddOperand(distanceCluster);
            }
        }

        // Minimize the total distance
        obj = model.Sum(d);
        model.Minimize(obj);

        model.Close();

        /* Parameterizes the solver. */
        LSPhase phase = localsolver.CreatePhase();
        phase.SetTimeLimit(limit);

        localsolver.Solve();  
    }

    /* Writes the solution in a file in the following format:
     *  - objective value
     *  - k
     *  - for each cluster, the coordinates along each variable
     */
    public void WriteSolution(string fileName)
    {
        using (StreamWriter output = new StreamWriter(fileName))
        {
            output.WriteLine(obj.GetDoubleValue());
            output.WriteLine(k);
            for (int c = 0; c < k; c++)
            {
                for (int v = 0; v < nbVariables; v++)
                {
                    output.Write(x[c, v].GetDoubleValue() + " ");
                }
                output.WriteLine();
            }
            output.Close();
        }
    }
    
    public static void Main(string[] args)
    {
          if (args.Length < 1)
        {
            Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
            System.Environment.Exit(1);
        }

        string instanceFile = args[0];
        string outputFile = args.Length > 1 ? args[1] : null;
        string strTimeLimit = args.Length > 2 ? args[2] : "5";
        string k = args.Length > 3 ? args[3] : "2";
        using (Kmeans model = new Kmeans(int.Parse(k)))
        {
            model.ReadInstance(instanceFile);
            model.Solve(int.Parse(strTimeLimit));
            if(outputFile != null)
            {
                model.WriteSolution(outputFile);
            }
        }
    }
}