This page is for an old version of Hexaly Optimizer.
We recommend that you update your version and read the documentation for the
latest stable release.
K-means¶
Principles learned¶
- Define the actual set of decision variables
- Define the float bounds from the data
- Use of non-linearities: minimizing a sum of “min”
Problem¶
Given a sample of n observations and a set of quantitative variables, the goal is to partition these observations into k clusters. Clusters are defined by their center of gravity. Each observation belongs to the cluster with the nearest center of gravity. For more details, see Wikipedia.
Download the exampleData¶
The format of the data is as follows:
- 1st line: number of observations and number of variables
- For each observation: coordinate along each variable, and the actual cluster it belongs to
Program¶
The only decision variables are x[c][v]
to define the coordinate of the
center of gravity of cluster c along variable v. There is no need for variables
to determine which cluster an observation belongs to: the distance with the
closest cluster is computed thanks to the min operator. The objective is to
minimize the sum of these distances.
- Execution:
- localsolver kmeans.lsp inFileName=instances/irit.dat [lsTimeLimit=] [solFileName=] [k=val]
/********** kmeans.lsp **********/
use io;
/* Reads instance data */
function input(){
usage = "\nUsage: localsolver kmeans.lsp "
+ "inFileName=inputFile [solFileName=outputFile] [lsTimeLimit=timeLimit] [k=value]\n";
if (inFileName == nil) throw usage;
local f = io.openRead(inFileName);
nbObservations = f.readInt();
nbVariables = f.readInt();
if (k == nil){
k = 2;
}
for[o in 1..nbObservations]{
for[v in 1..nbVariables]{
M[o][v] = f.readDouble();
}
clusters[o] = f.readString();
}
// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
for[v in 1..nbVariables]{
mini[v] = min[o in 1..nbObservations](M[o][v]);
maxi[v] = max[o in 1..nbObservations](M[o][v]);
}
}
/* Declares the optimization model. */
function model(){
// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x[1..k][v in 1..nbVariables] <- float(mini[v], maxi[v]);
// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
d[o in 1..nbObservations] <- min[c in 1..k](sum[v in 1..nbVariables](pow(x[c][v] - M[o][v], 2)));
// Minimize the total distance
obj <- sum[o in 1..nbObservations](d[o]);
minimize obj;
}
/* Parameterizes the solver. */
function param(){
if(lsTimeLimit == nil) lsTimeLimit=5;
}
/* Writes the solution in a file in the following format:
* - objective value
* - k
* - for each cluster, the coordinates along each variable
*/
function output(){
if(solFileName == nil) return;
local solFile = io.openWrite(solFileName);
solFile.println(obj.value);
solFile.println(k);
for[c in 1..k] {
for[v in 1..nbVariables] {
solFile.print(x[c][v].value + " ");
}
solFile.println();
}
}
- Execution (Windows)
- set PYTHONPATH=%LS_HOME%\bin\python27\python kmeans.py instances\irit.dat
- Execution (Linux)
- export PYTHONPATH=/opt/localsolver_XXX/bin/python27/python kmeans.py instances/irit.dat
########## kmeans.py ##########
import localsolver
import sys
if len(sys.argv) < 2:
print ("Usage: python kmeans.py inputFile [outputFile] [timeLimit] [k value]")
sys.exit(1)
def read_elem(filename):
with open(filename) as f:
return [str(elem) for elem in f.read().split()]
with localsolver.LocalSolver() as ls:
#
# Reads instance data
#
if len(sys.argv) > 4: k = int(sys.argv[4])
else: k = 2
file_it = iter(read_elem(sys.argv[1]))
# Data properties
nbObservations = int(file_it.next())
nbVariables = int(file_it.next())
M = [None]*nbObservations
qualitative = [None]*nbObservations
for o in range(nbObservations):
M[o] = [None]*(nbVariables)
for v in range(nbVariables):
M[o][v] = float(file_it.next())
qualitative[o] = file_it.next()
# mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
mini = [min(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]
maxi = [max(M[o][v] for o in range(nbObservations)) for v in range(nbVariables)]
#
# Declares the optimization model
#
model = ls.model
# x[o][v] are decision variables equal to the center of gravity of the kth group for the jit variable
x = [[model.float(mini[v], maxi[v]) for v in range(nbVariables)] for c in range(k)]
# d[o] is the distance between observation o and the nearest cluster (its center of gravity)
d = [model.min(model.sum((x[c][v] - M[o][v])**2 for v in range(nbVariables)) for c in range(k))
for o in range(nbObservations)]
# Minimize the total distance
obj = model.sum(d)
model.minimize(obj)
model.close()
#
# Parameterizes the solver
#
ls.param.nb_threads = 2
if len(sys.argv) > 3: ls.create_phase().time_limit = int(sys.argv[3])
else: ls.create_phase().time_limit = 5
ls.solve()
#
# Writes the solution in a file in the following format:
# - objective value
# - k
# - for each cluster, the coordinates along each variable
#
if len(sys.argv) > 2:
with open(sys.argv[2], 'w') as f:
f.write("%f\n" % obj.value)
f.write("%d\n" % k)
for c in range(k):
for v in range(nbVariables):
f.write("%f " % x[c][v].value)
f.write("\n")
- Compilation / Execution (Windows)
- cl /EHsc kmeans.cpp -I%LS_HOME%\include /link %LS_HOME%\bin\localsolver.dll.libkmeans instances\irit.dat
- Compilation / Execution (Linux)
- g++ kmeans.cpp -I/opt/localsolver_XXX/include -llocalsolver -lpthread -o kmeans./kmeans instances/irit.dat
//********* kmeans.cpp *********
#include <iostream>
#include <fstream>
#include <vector>
#include <limits>
#include "localsolver.h"
using namespace localsolver;
using namespace std;
class Kmeans {
public:
// Data properties
int nbObservations;
int nbVariables;
int k;
vector<vector<lsdouble> > M;
vector<string> clusters;
// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
vector<lsdouble> mini;
vector<lsdouble> maxi;
// Solver.
LocalSolver localsolver;
// Decisions
vector<vector<LSExpression> > x;
// Objective
LSExpression obj;
// Constructor
Kmeans(int k) : k(k) { }
// Reads instance data
void readInstance(const string& fileName) {
ifstream infile;
infile.exceptions(ifstream::failbit | ifstream::badbit);
infile.open(fileName.c_str());
infile >> nbObservations;
infile >> nbVariables;
M.resize(nbObservations);
clusters.resize(nbObservations);
mini.resize(nbVariables, numeric_limits<lsdouble>::infinity());
maxi.resize(nbVariables, -numeric_limits<lsdouble>::infinity());
for (int o = 0; o < nbObservations; o++) {
M[o].resize(nbVariables);
for (int v = 0; v < nbVariables; v++) {
infile >> M[o][v];
if (M[o][v] < mini[v]) mini[v] = M[o][v];
if (M[o][v] > maxi[v]) maxi[v] = M[o][v];
}
infile >> clusters[o];
}
}
void solve(int limit) {
// Declares the optimization model.
LSModel model = localsolver.getModel();
// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x.resize(k);
for (int c = 0; c < k; c++) {
x[c].resize(nbVariables);
for (int v = 0; v < nbVariables; v++) {
x[c][v] = model.floatVar(mini[v], maxi[v]);
}
}
// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
vector<LSExpression> d(nbObservations);
for(int o = 0; o < nbObservations; o++) {
d[o] = model.min();
for(int c = 0; c < k; c++) {
LSExpression distanceCluster = model.sum();
for(int v = 0; v < nbVariables; v++) {
distanceCluster += model.pow(x[c][v] - M[o][v], 2);
}
d[o].addOperand(distanceCluster);
}
}
// Minimize the total distance
obj = model.sum(d.begin(), d.end());
model.minimize(obj);
model.close();
// Parameterizes the solver.
LSPhase phase = localsolver.createPhase();
phase.setTimeLimit(limit);
localsolver.solve();
}
// Writes the solution in a file in the following format:
// - objective value
// - k
// - for each cluster, the coordinates along each variable
void writeSolution(const string& fileName) {
ofstream outfile;
outfile.exceptions(ofstream::failbit | ofstream::badbit);
outfile.open(fileName.c_str());
outfile << obj.getDoubleValue() << endl;
outfile << k << endl;
for(int c = 0; c < k; c++) {
for(int v = 0; v < nbVariables; v++) {
outfile << x[c][v].getDoubleValue() << " ";
}
outfile << endl;
}
}
};
int main(int argc, char** argv) {
if (argc < 2) {
cerr << "Usage: kmeans inputFile [outputFile] [timeLimit] [k value]" << endl;
return 1;
}
const char* instanceFile = argv[1];
const char* solFile = argc > 2 ? argv[2] : NULL;
const char* strTimeLimit = argc > 3 ? argv[3] : "5";
const char* k = argc > 4 ? argv[4] : "2";
try {
Kmeans model(atoi(k));
model.readInstance(instanceFile);
model.solve(atoi(strTimeLimit));
if(solFile != NULL) model.writeSolution(solFile);
return 0;
} catch (const exception& e){
cerr << "Error occurred: " << e.what() << endl;
return 1;
}
}
- Compilation/Execution (Windows)
- copy %LS_HOME%\bin\*net.dll .csc Kmeans.cs /reference:localsolvernet.dllKmeans instances\irit.dat
/********** kmeans.cs **********/
using System;
using System.IO;
using localsolver;
public class Kmeans : IDisposable
{
// Data properties
int nbObservations;
int nbVariables;
int k;
double[][] M;
string[] clusters;
// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
double[] mini;
double[] maxi;
// Solver.
LocalSolver localsolver;
// Decisions.
LSExpression[][] x;
// Objective.
LSExpression obj;
public Kmeans(int k)
{
localsolver = new LocalSolver();
this.k = k;
}
// Reads instance data
public void ReadInstance(string fileName)
{
using (StreamReader input = new StreamReader(fileName))
{
string[] splittedLine = input.ReadLine().Split();
nbObservations = int.Parse(splittedLine[0]);
nbVariables = int.Parse(splittedLine[1]);
mini = new double[nbVariables];
maxi = new double[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
mini[v] = double.PositiveInfinity;
maxi[v] = double.NegativeInfinity;
}
M = new double[nbObservations][];;
clusters = new string[nbObservations];
for (int o = 0; o < nbObservations; o++)
{
splittedLine = input.ReadLine().Split();
M[o] = new double[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
M[o][v] = double.Parse(splittedLine[v]);
mini[v] = Math.Min(M[o][v], mini[v]);
maxi[v] = Math.Max(M[o][v], maxi[v]);
}
clusters[o] = splittedLine[nbVariables];
}
}
}
public void Dispose()
{
if (localsolver != null)
localsolver.Dispose();
}
public void Solve(int limit)
{
// Declares the optimization model
LSModel model = localsolver.GetModel();
// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x = new LSExpression[k][];
for (int o = 0; o < k; o++)
{
x[o] = new LSExpression[nbVariables];
for (int v = 0; v < nbVariables; v++)
{
x[o][v] = model.Float(mini[v], maxi[v]);
}
}
// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
LSExpression[] d = new LSExpression[nbObservations];
for (int o = 0; o < nbObservations; o++)
{
d[o] = model.Min();
for (int c = 0; c < k; c++)
{
LSExpression distanceCluster = model.Sum();
for (int v = 0; v < nbVariables; v++)
{
distanceCluster.AddOperand(model.Pow(x[c][v] - M[o][v], 2));
}
d[o].AddOperand(distanceCluster);
}
}
// Minimize the total distance
obj = model.Sum(d);
model.Minimize(obj);
model.Close();
// Parameterizes the solver.
LSPhase phase = localsolver.CreatePhase();
phase.SetTimeLimit(limit);
localsolver.Solve();
}
// Writes the solution in a file in the following format:
// - objective value
// - k
// - for each cluster, the coordinates along each variable
public void WriteSolution(string fileName)
{
using (StreamWriter output = new StreamWriter(fileName))
{
output.WriteLine(obj.GetDoubleValue());
output.WriteLine(k);
for (int c = 0; c < k; c++)
{
for (int v = 0; v < nbVariables; v++)
{
output.Write(x[c][v].GetDoubleValue() + " ");
}
output.WriteLine();
}
output.Close();
}
}
public static void Main(string[] args)
{
if (args.Length < 1)
{
Console.WriteLine("Usage: Kmeans inputFile [outputFile] [timeLimit] [k value]");
Environment.Exit(1);
}
string instanceFile = args[0];
string outputFile = args.Length > 1 ? args[1] : null;
string strTimeLimit = args.Length > 2 ? args[2] : "5";
string k = args.Length > 3 ? args[3] : "2";
using (Kmeans model = new Kmeans(int.Parse(k)))
{
model.ReadInstance(instanceFile);
model.Solve(int.Parse(strTimeLimit));
if (outputFile != null)
{
model.WriteSolution(outputFile);
}
}
}
}
- Compilation / Execution (Windows)
- javac Kmeans.java -cp %LS_HOME%\bin\localsolver.jarjava -cp %LS_HOME%\bin\localsolver.jar;. Kmeans instances\irit.dat
- Compilation/Execution (Linux)
- javac Kmeans.java -cp /opt/localsolver_XXX/bin/localsolver.jarjava -cp /opt/localsolver_XXX/bin/localsolver.jar:. Kmeans instances/irit.dat
/********** Kmeans.java **********/
import java.util.*;
import java.io.*;
import localsolver.*;
public class Kmeans {
// Data properties
private int nbObservations;
private int nbVariables;
private int k;
private double[][] M;
private String[] clusters;
// mini[v] and maxi[v] are respectively the minimum and maximum of the vth variable
private double[] mini;
private double[] maxi;
// Solver.
private LocalSolver localsolver;
// Decisions.
private LSExpression[][] x;
// Objective.
private LSExpression obj;
private Kmeans(int k) {
this.k = k;
}
// Reads instance data
private void readInstance(String fileName) throws IOException {
try (Scanner input = new Scanner(new File(fileName))) {
input.useLocale(Locale.ROOT);
nbObservations = input.nextInt();
nbVariables = input.nextInt();
M = new double[nbObservations][nbVariables];
clusters = new String[nbObservations];
mini = new double[nbVariables];
maxi = new double[nbVariables];
Arrays.fill(mini, Double.POSITIVE_INFINITY);
Arrays.fill(maxi, Double.NEGATIVE_INFINITY);
for (int o = 0; o < nbObservations; o++) {
for (int v = 0; v < nbVariables; v++) {
M[o][v] = input.nextDouble();
mini[v] = Math.min(M[o][v], mini[v]);
maxi[v] = Math.max(M[o][v], maxi[v]);
}
clusters[o] = input.next();
}
}
}
private void solve(int limit) {
// Declares the optimization model.
localsolver = new LocalSolver();
LSModel model = localsolver.getModel();
// x[c][v] is the coordinate for the center of gravity of the cluster c along the variable v
x = new LSExpression[k][nbVariables];
for (int c = 0; c < k; c++) {
for (int v = 0; v < nbVariables; v++) {
x[c][v] = model.floatVar(mini[v], maxi[v]);
}
}
// d[o] is the distance between observation o and the nearest cluster (its center of gravity)
LSExpression[] d = new LSExpression[nbObservations];
for (int o = 0; o < nbObservations; o++) {
d[o] = model.min();
for (int c = 0; c < k; c++) {
LSExpression distanceCluster = model.sum();
for (int v = 0; v < nbVariables; v++) {
distanceCluster.addOperand(model.pow(model.sub(x[c][v], M[o][v]), 2));
}
d[o].addOperand(distanceCluster);
}
}
// Minimize the total distance
obj = model.sum(d);
model.minimize(obj);
model.close();
// Parameterizes the solver.
LSPhase phase = localsolver.createPhase();
phase.setTimeLimit(limit);
localsolver.solve();
}
// Writes the solution in a file in the following format:
// - objective value
// - k
// - for each cluster, the coordinates along each variable
private void writeSolution(String fileName) throws IOException {
try (PrintWriter output = new PrintWriter(fileName)) {
output.println(obj.getDoubleValue());
output.println(k);
for (int c = 0; c < k; c++) {
for (int v = 0; v < nbVariables; v++) {
output.print(x[c][v].getDoubleValue() + " ");
}
output.println();
}
}
}
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Usage: java Kmeans inputFile [outputFile] [timeLimit] [k value]");
System.exit(1);
}
String instanceFile = args[0];
String outputFile = args.length > 1 ? args[1] : null;
String strTimeLimit = args.length > 2 ? args[2] : "5";
String k = args.length > 3 ? args[3] : "2";
try {
Kmeans model = new Kmeans(Integer.parseInt(k));
model.readInstance(instanceFile);
model.solve(Integer.parseInt(strTimeLimit));
if (outputFile != null) {
model.writeSolution(outputFile);
}
} catch (Exception ex) {
System.err.println(ex);
ex.printStackTrace();
System.exit(1);
}
}
}