new classifier

parent c5e698ae
......@@ -2,12 +2,13 @@
* \file bmf_classification_tool.cpp
* \brief Classification tool.
*
* Input pbm "image" with binary samples as columns and an ASCII space-separated file with class labels as numbers
* Procedure:
* 1) split dataset into training and testing subsets
* 2) learns a dictionary for the samples of class c=1,...,C within the training subset; this results in C dictionaries
* 3) classifies each sample x0 in the testing subset using each of the C dictionaries; this yields a set of scores {l_1,l_2,...,l_C}
* 4) declares sample x0 to belong to class c* if c* = argmin {l_c: c=1,...,C}
* Each input argument that appears before a "--" is interpreted as a dictionary corresponding to one class.
* Classes are numbered in order of appearance. All dictionaries must have the same number of columns M,
* but may differ in the number of rows. Let C be the number of such classes.
* Each argument after the "--" is interpreted as data to be classified. Each data file must also have
* M columns, but may differ in the number of rows: N_1, N_2, etc.
* For each data file of size N_i x M, a matrix of size N_i x C is produced, where column c contains, at row
* j, the score of representing the j-th row sample using the c-th dictionary; lower values are better.
*/
#include <cstdio>
#include <cstdlib>
......@@ -30,12 +31,22 @@ idx_t K = 512;
// Option flags; the code that consumes them is not visible in this part of the file.
bool image_mode = false;
bool force_mosaic = true;
bool force_residual_mosaic = true;

// Default input file names: Xname is opened as the PBM data file and Lname as the labels file.
const char* Xname = "data/mnist_data.pbm";
const char* Lname = "data/mnist_labels.ascii";

// Capacity limits for the fixed-size tables below.
#define MAX_CLASSES 1024
#define MAX_DATA_FILES 1024

// File names of the class dictionaries, in order of appearance on the command line.
const char* dictionary_files[MAX_CLASSES]; // no more than MAX_CLASSES classes
int number_of_classes = 0;

// File names of the data sets to be classified.
const char* data_files[MAX_DATA_FILES]; // no more than MAX_DATA_FILES data files
int number_of_data_files = 0;

// Loaded matrices: one dictionary per class, one matrix per data file.
binary_matrix dictionary[MAX_CLASSES];
binary_matrix data[MAX_DATA_FILES];

// One heap-allocated score buffer per data file (a null entry means "not allocated").
double* score[MAX_DATA_FILES];
void parse_args(int argc, char **argv) {
bool before_break = true;
for (int i = 0; i < argc; ++i) {
if (argv[i][0] == '-') {
if (before_break && (argv[i][0] == '-')) {
if (i == (argc-1)) {
std::cerr << "Missing argument for " << argv[i] << std::endl;
exit(-1);
......@@ -43,38 +54,11 @@ void parse_args(int argc, char **argv) {
const char* val = argv[i+1];
// std::cout << "Parameter " << argv[i] << " value " << val << std::endl;
switch (argv[i][1]) {
case 'i':
mi_algo = atoi(val);
break;
case 'c':
es_algo = atoi(val);
break;
case 'd':
du_algo = atoi(val);
break;
case 'l':
lm_algo = atoi(val);
break;
case 'L':
lmi_algo = atoi(val);
break;
case 'w':
W = (idx_t) atoi(val);
break;
case 'k':
K = (idx_t) atoi(val);
break;
case 'r':
random_seed = atol(val);
break;
case 'I':
image_mode = (atoi(val) > 0);
break;
case 'm':
force_mosaic = (atoi(val) > 0);
break;
case 'M':
force_residual_mosaic = (atoi(val) > 0);
case '-':
before_break = false;
break;
default:
std::cerr << "Invalid option " << argv[i] << std::endl;
......@@ -82,54 +66,109 @@ void parse_args(int argc, char **argv) {
}
i++;
} else {
Xname = argv[i];
Lname = argv[i+1];
if (before_break) {
dictionary_files[number_of_classes++] = argv[i];
} else {
data_files[number_of_data_files++] = argv[i];
}
}
}
}
int main(int argc, char **argv) {
/**
 * Zero-fills the global dictionary, data and score tables.
 *
 * memset takes (destination, fill byte, byte count). The previous calls passed
 * MAX_CLASSES as the fill byte and the size of a single element as the count,
 * so only the first entry of each table was cleared. sizeof on the arrays
 * themselves covers every entry regardless of the table capacity.
 *
 * NOTE(review): raw zero-filling assumes binary_matrix tolerates an all-zero
 * representation (the caller relies on destroy() doing nothing in that state) - confirm.
 */
static void init() {
    memset(dictionary, 0, sizeof(dictionary));
    memset(data, 0, sizeof(data));
    memset(score, 0, sizeof(score));
}
// cleanup() is defined further down in this file; declare it so the error paths can call it.
void cleanup();

/**
 * Loads the PBM file `fname` into the matrix `A`.
 *
 * Reads the PBM header to obtain the dimensions, allocates `A` accordingly and
 * then reads the matrix contents. On any failure an error is written to
 * std::cerr, the file (if open) is closed, cleanup() is invoked and the
 * process terminates with exit status 1.
 *
 * @param fname path of the PBM file to read
 * @param A     matrix that receives the contents; allocated here as rows x cols
 */
void load_mat(const char* fname, binary_matrix& A) {
    idx_t rows, cols;

    FILE* f = fopen(fname, "r");
    if (!f) {
        std::cerr << "Error opening " << fname << std::endl;
        cleanup();
        std::exit(1);
    }

    // The header must be read from the file opened above.
    int res = read_pbm_header(f, rows, cols);
    if (res != PBM_OK) {
        std::cerr << "Error reading header of " << fname << std::endl;
        fclose(f);
        cleanup();
        std::exit(1);
    }

    A.allocate(rows, cols);
    res = read_pbm_data(f, A);
    if (res != PBM_OK) {
        std::cerr << "Error " << res << " reading " << fname << std::endl;
        fclose(f);
        cleanup();
        std::exit(1);
    }

    fclose(f);
}
/**
 * Releases all global resources: destroys every dictionary and data matrix
 * registered so far and frees the per-data-file score buffers.
 *
 * Score pointers are reset after being freed so that calling this function
 * more than once never frees the same buffer twice.
 */
void cleanup() {
    for (idx_t c = 0; c < number_of_classes; c++) {
        dictionary[c].destroy();
    }
    for (idx_t d = 0; d < number_of_data_files; d++) {
        data[d].destroy();
        delete[] score[d]; // delete[] on a null pointer is a no-op
        score[d] = NULL;
    }
}
/**
 * Program entry point.
 *
 * Zeroes the global tables, parses the command line, loads every class
 * dictionary and every data file, allocates one score buffer per data file
 * and finally releases everything. The classification step itself is still
 * a placeholder.
 *
 * @return 0 on success; the process exits with status 2 when the dictionaries
 *         do not all have the same number of columns (load_mat exits with 1
 *         on I/O errors).
 */
int main(int argc, char **argv) {
    //
    // initialize all matrices to 0; this ensures
    // destroy will do nothing on uninitialized matrices
    //
    init();
    //
    // parse parameters
    //
    parse_args(argc, argv);
    //
    // load class dictionaries
    //
    idx_t M = 0;
    for (idx_t c = 0; c < number_of_classes; c++) {
        load_mat(dictionary_files[c], dictionary[c]);
        if (c == 0) {
            // the first dictionary fixes the expected number of columns
            M = dictionary[c].get_cols();
        } else if (M != dictionary[c].get_cols()) {
            std::cerr << "All dictionaries must have the same number of columns " << M << std::endl;
            cleanup(); // release what was loaded so far before bailing out
            std::exit(2);
        }
    }
    //
    // load data files
    //
    for (idx_t d = 0; d < number_of_data_files; d++) {
        load_mat(data_files[d], data[d]);
    }
    //
    // create score matrices: one zero-initialized entry per
    // (data row, class) pair; score[d] is a plain double buffer
    //
    for (idx_t d = 0; d < number_of_data_files; d++) {
        score[d] = new double[data[d].get_rows() * number_of_classes]();
    }
    //
    // classify
    //
    for (idx_t d = 0; d < number_of_data_files; d++) {
        for (idx_t c = 0; c < number_of_classes; c++) {
            // TODO: encode data[d] with dictionary[c] and store the
            // resulting per-row scores in score[d]
        }
    }
    //
    // cleanup
    //
    cleanup();
    return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment