//-*-Mode:C++-*- 
 
//== FILE HEADER BEGIN
//
// = FILENAME
//   baseclearn.cc
//
// = AUTHOR(S)
//   Johann Petrak
//
// = DESCRIPTION
//   baseclearn is a program that implements the following dumb
//   learning algorithms for classification learning:
//     - always predict the most frequent class label (if there
//       is a tie, predict an arbitrary one); selected with -l 2
//     - predict a fixed, arbitrarily chosen label for all test cases
//       (the first label encountered in the names file definition);
//       selected with -l 1
//
// = SYNOPSIS
//   baseclearn -h
//
// = COPYRIGHT
//   Copyright (2001) Austrian Research Institute
//   for Artificial Intelligence, Vienna, Austria
// 
// = CHANGES
//   2001/07/17 1.0: initial version
//
//===============


// Version identification string.  It carries the "@(#)" prefix that
// what(1) conventionally searches for and is printed by usage() and by
// the -v option.  Declared pointer-to-const because it refers to a string
// literal, which must never be written to.
static const char * rcsid =
"@(#)$Id: baseclearn.cc 1.1 2001/07/19 by johann $";

// Identification string recording the date and time of compilation;
// printed by usage().
static const char * pgmid =
"@(#)$Pgm: baseclearn.cc compiled on " __DATE__ " at " __TIME__ " $";


#include<stdio.h>
#include<stdlib.h>
#include<errno.h>

#include<fstream.h>
#include<string.h>

// Some systems already declare sys_errlist in <stdio.h>, others do not.
// Comment the following declaration out if the compiler reports
// duplicate/previous declaration errors for it.
// NOTE(review): sys_errlist is not referenced anywhere else in this file.
extern "C" const char *const  sys_errlist[]; 

// Minimal getopt()-like command line parser, defined further below.
// Returns the option character found, '\0' when no arguments are left and
// '?' on error; the argument of an option, if any, is returned in value.
char mygetopt(int argc, char ** argv, char * str, char *& value);

// CONSTANTS ... arbitrary limits, most NOT checked yet
// NAMELENGTH:  size of the buffers in main() that hold the names file and
//              data/test file names (filestem plus extension)
// LABELLENGTH: size of the buffer in main() that holds the predicted class
//              label; also the limit passed to getline() in getLabel()
const unsigned int NAMELENGTH  = 1024; // max length of file name
const unsigned int LABELLENGTH = 1024; // max length of class label
// the maximum length of a single data record is BUFSIZE!
// (size of the line buffers labelline and buf allocated in main())
const unsigned int BUFSIZE = 200000;
// maximum number of allowed class labels
// (dimension of the labels[] and counts[] tables)
const unsigned int MAXLABELS = 1024;


// since this is such a small program, most vars are global!
// verbose:  set by option -v; enables extra messages on stderr
// algnr:    algorithm number given with -l (1 or 2); 0 means "not given"
// filestem: filestem given with -f; 0 means "not given"
bool verbose = false;
int  algnr = 0;
char * filestem = 0;


// labelline: receives the class label definition line of the names file
//            (filled by getNames()); makeTab() splits it in place
// buf:       line buffer for the records of the data/test file
// maxn:      record limit given with -n; 0 means no limit
char * labelline = 0; // will allocate BUFSIZE later
char * buf       = 0; // will allocate BUFSIZE later
unsigned int maxn = 0;

// Label table built by makeTab(): labels[i] points at the i-th label
// inside the line handed to makeTab(), counts[i] is the number of training
// records seen with that label, nlabels is the number of valid entries.
char * labels[MAXLABELS];
unsigned long int counts[MAXLABELS];
int nlabels = 0;


// Index into argv of the next command line argument mygetopt() will look
// at.  Starts at 1 so that the program name is skipped.
int  curindex = 1;

// Minimal getopt() replacement: examine the next command line argument.
//
//   argc, argv  argument vector as passed to main()
//   str         option specification: every accepted option character,
//               followed by ':' if that option takes an argument
//               (for example "l:f:tv")
//   value       receives a pointer to the option argument for options that
//               take one; left untouched for all other options
//
// Returns the option character, '\0' once all arguments have been consumed,
// or '?' if the argument does not start with '-', the option character is
// not listed in str, or a required option argument is missing.
//
// Only the first character after the '-' is looked at for options without
// an argument, so bundled flags such as "-tv" are not supported.
char mygetopt(int argc, char ** argv, char * str, char *& value)
{
  // is there any argument left (not counting the program name)?
  if (curindex >= argc) return '\0';

  // get next argument; it must start with a dash to be an option
  char * curopt = argv[curindex++];
  if (*curopt++ != '-') return '?';

  // A lone "-" has no option character at all, and ':' is only the
  // "takes an argument" marker inside str, never an option by itself.
  char optchar = *curopt;
  if (optchar == '\0' || optchar == ':') return '?';
  curopt++;

  // look if the option char is part of the specification
  char * spec = strchr(str, optchar);
  if (spec == 0) return '?';

  // does this option take an argument?
  if (spec[1] == ':')
  {
    if (*curopt) {
      // argument directly attached to the option character ("-fstem")
      value = curopt;
    } else if (curindex < argc) {
      // argument is the next command line argument ("-f stem")
      value = argv[curindex++];
    } else {
      // required argument is missing
      optchar = '?';
    }
  }
  return optchar;
}

// Print the command line help text, followed by the version and
// compilation identification strings, to stderr.
//
//   command  program name to show in the synopsis line (main() passes
//            Argv[0])
void usage(char * command)
{
  cerr << "Usage:\n";
  cerr << "  " << command
       << "  -l N -t|-e -f filestem -m modelfile [-p predfile] [-n N] [-v] [-h]\n\n"
       << "  Run a very basic learning algorithm depending on N\n\n"
       << "  -l N: N=1 same random label for all; N=2 label that was\n"
       << "     most frequent in train for all\n"
       << "  -t|-e: training(-t) or evaluation/testing (-e) phase.\n"
       << "     for training need\n"
       << "     filestem.data and filestem.names, for testing need\n"
       << "     filestem.test and filestem.names\n"
       << "  -f stem: filestem of input files\n"
       << "  -m name: full name of modelfile\n"
       << "  -p name: full name of prediction file (required with -e)\n"
       << "  -n N: maximum number of data records to read (only for -t), 0=no limit\n"
       << "  -v: print some statistics and process messages to stderr\n"
       << "  -h show this help\n"
       << "\n" << rcsid << "\n" << pgmid << "\n";
}




// Return the class label of a data record, i.e. the last comma separated
// field of the line.
//
//   p  NUL terminated record; MODIFIED IN PLACE: trailing blanks, tabs,
//      carriage returns and periods are cut off first, so that
//      "a,b,yes." and "a,b,yes" both yield "yes"
//
// Returns a pointer into p to the first character after the last comma, or
// p itself if the line contains no comma.  The result is an empty string
// for an empty line or a line that ends in a comma.
char *pLabel(char * p)
/*    ----------  */
{
    // Start at the last real character, not at the terminating NUL --
    // otherwise nothing would ever be stripped.
    size_t len = strlen(p);
    while (len > 0) {
        char c = p[len - 1];
        if (c != ' ' && c != '\t' && c != '\r' && c != '.') break;
        p[--len] = 0;
    }

    // walk back to just behind the last comma (or to the line start)
    char * t = p + len;
    while (t != p && t[-1] != ',') t--;
    return t;
}

// Open the names file and read lines until one is found that looks like
// the class label definition line.  The rule is simple: the line must
// contain a comma but no colon.
//
//   filename  name of the names file to read
//
// On return the global buffer labelline (BUFSIZE bytes, allocated by the
// caller beforehand) holds that line without its newline.  The program is
// terminated with exit(1) if the file cannot be opened or if it contains
// no such line -- without the latter check the callers would go on parsing
// whatever line happened to be read last.
void getNames(char * filename) {
  ifstream in(filename);
  if (!in.is_open()) {
    cerr << "Couldnt open file " << filename << endl;
    exit(1);
  }
  bool found = false;
  while(!found && in.getline(labelline,BUFSIZE,'\n')) {
    // check if it is a label definition line:
    // simple rule: must have a comma in it, but no colon
    if (strchr(labelline,',') && !strchr(labelline,':')) {
      cerr << "Found labels: " << labelline << endl;
      found = true;
    }
  }
  in.close();
  if (!found) {
    cerr << "No class label definition found in file " << filename << endl;
    exit(1);
  }
}

// Copy the first label of a label definition line into a caller supplied
// buffer.
//
//   line   NUL terminated label definition line; leading blanks and tabs
//          are skipped
//   label  receives the NUL terminated label: everything up to, but not
//          including, the first comma, or up to the end of the line if
//          there is no comma.  The buffer size is not passed in, so the
//          caller must make sure the label fits.
//
// The start of the label is traced on stderr.
void getFirstLabel(char * line, char * label) {
  while(*line == ' ' || *line == '\t')  line++;
  cerr << "Found start:";
  cerr << line << "<" << *line << endl;
  // Stop at the terminating NUL as well: a line without any comma must
  // not make the copy run off the end of the buffer.
  while(*line != '\0' && *line != ',') *(label++) = *(line++);
  *label = 0;
}

// Store the model: create (or overwrite) the file and write the given
// label followed by a newline as its only content.
//
//   filename     name of the model file to write
//   targetlabel  NUL terminated class label to store
//
// Terminates the program with exit(1) if the file cannot be opened.
void writeLabel(char * filename, char * targetlabel) {
  ofstream model(filename);
  if (model.is_open()) {
    model << targetlabel << endl;
    model.close();
    return;
  }
  cerr << "Couldnt open file " << filename << endl;
  exit(1);
}

// Load the model: read the first line of the file into the caller's
// buffer.
//
//   filename     name of the model file to read
//   targetlabel  buffer of at least LABELLENGTH bytes that receives the
//                line (without the newline)
//
// Terminates the program with exit(1) if the file cannot be opened.
void getLabel(char * filename, char * targetlabel) {
  ifstream model(filename);
  if (model.is_open()) {
    model.getline(targetlabel,LABELLENGTH,'\n');
    model.close();
    return;
  }
  cerr << "Couldnt open file " << filename << endl;
  exit(1);
}

void makeTab(char * labelline) {
  nlabels = 0;
  while(*labelline != 0 && *labelline != '\n') {
    while(*labelline == ' ' || *labelline == '\t') 
      { 
	//cout << *labelline; 
	labelline++; }
    if (*labelline != 0) { 
      //cout << "NEW";
      nlabels++;
      counts[nlabels-1] = 0;
      labels[nlabels-1] = labelline;
      while(*labelline != 0 && *labelline != ',' && *labelline != '.') 
	{ 
	  //cout << *labelline; 
	  labelline++; }
      if (*labelline == ',' || *labelline == '.')
	{ 
	  //cout << "END"; 
	  *labelline = 0; labelline++; }
    }
  }
}


int main(int Argc, char** Argv)
{
  // Print programmname und version
  const char * compile_date = __DATE__;
  const char * compile_time = __TIME__;
  if (verbose) {
    cout << Argv[0] << ": baseclearn" << rcsid << "\n" << pgmid << "\n";
  }

  
  char *   modelfile  = 0;     // name of model file to read/write
  char *   predfile  = 0;      // name of fiel with predictions
  char namesfile[NAMELENGTH];  // name of names file to read
  char datafile[NAMELENGTH];   // name of data/test file to read

  char targetlabel[LABELLENGTH];


  int i;

  bool train = true;
  // ****************************
  // process command line options 
  char o;
  char * value;
  
  
  while ( (o = mygetopt(Argc, Argv,
			"l:f:m:p:n:tevh", value)) != '\0' )
  {
    
    switch (o)
    {
      case 'l':
      algnr = atoi(value);
      if (algnr < 1 || algnr > 2) {
	cerr << "Invalid algorithm number for -l\n";
	exit(1);
      }
      break; 

      case 'n':
      maxn = atoi(value);
      break; 

      case 'f':
      filestem = value;
      break; 

      case 'm':
      modelfile = value;
      break; 

      case 'p':
      predfile = value;
      break; 

      case 't':
      train = true;
      break; 

      case 'e':
      train = false;
      break; 


      case 'v':
      cerr << "Program version: " << rcsid << endl;
      cerr << "Compiled: " << __COMPILEDON__ << endl;
      verbose = true;
      break;
      

      case '?':
      cerr << "  Unknown option\n";
      usage(Argv[0]);
      exit(1);

      case 'h':
      usage(Argv[0]);
      exit(0);

    }
  }
  
  if (filestem == 0) {
    cerr << "Filestem missing (-f)!\n";
    exit(1);
  }

  if (modelfile == 0) {
    cerr << "Model file name missing (-m)!\n";
    exit(1);
  }

  if (!train && predfile == 0) {
    cerr << "Prediction file name missing for testing (-p)!\n";
    exit(1);
  }

  if (algnr == 0) {
    cerr << "Algorithm number missing (-l)!\n";
    exit(1);
  }


  strcpy(namesfile, filestem);
  strcat(namesfile,".names");
  strcpy(datafile, filestem);
  if (train) {
    strcat(datafile,".data");
  } else {
    strcat(datafile,".test");
  }
  
  if (verbose) {
    cerr << "Indata:    " << datafile << endl;
    cerr << "Innames:   " << namesfile << endl;
    cerr << "Modelfile: " << modelfile << endl;
    cerr << "Run:       " << (train ? "train" : "test") << " phase\n";
    cerr << "Alg:       " << algnr << endl;
  }

  if((labelline = (char *)malloc(sizeof(char)*BUFSIZE)) == 0) {
    cerr << "Not enough memory (1)\n";
    exit(1);
  }
  if((buf = (char *)malloc(sizeof(char)*BUFSIZE)) == 0) {
    cerr << "Not enough memory (2)\n";
    exit(1);
  }


  if (train) {
    switch (algnr) {
      case 1:
      // open the names file and find the first label in the 
      // class label definition
	getNames(namesfile);
	getFirstLabel(labelline,targetlabel);
      // write the label to the model file
	writeLabel(modelfile,targetlabel);
      break;
      
      case 2:
	// get label list
	getNames(namesfile);
        // create counter table
	makeTab(labelline);
	//for (i = 0; i < nlabels; i++)
	//  cout << "label number " << i << " is " << labels[i] << endl;

	// read data file
	ifstream in(datafile);
	if (!in.is_open()) {
	  cerr << "Couldnt open data file\n";
	  exit(1);
	}
	unsigned long int n = 0;
	while(in.getline(buf,BUFSIZE,'\n')) {
	  n++;
	  // pLabel(buf) == oneOfLabels -> increase counter
	  i = 0;
	  while (i < nlabels) {
	    if (strcmp(labels[i],pLabel(buf)) == 0) {
	      counts[i]++;
	      break;
	    }
	    i++;
	  }
	  if (i==nlabels) {
	    cerr << "label " << pLabel(buf) << "not found in line " << n << endl;
	    exit(1);
	  }
	    

	  if (n == maxn) break;
	}
	// find label with highest counter
	unsigned long max = 0; 
	unsigned int maxnr = 0;
	for(i=0;i<nlabels;i++) {
	  if (verbose)
	    cerr << "Label " << labels[i] << ": " << counts[i] << endl;
	  if (counts[i]>max) {
	    max = counts[i];
	    maxnr = i;
	  }
	}
	// write model file
	writeLabel(modelfile,labels[maxnr]);
      break;
    }
  } else { // if test
    switch (algnr) {
      // for both 1 and 2, testing phase is same
      case 1:
      case 2:
      { 
	getLabel(modelfile,targetlabel);
	// read testing file
	ifstream in(datafile);
	if (!in.is_open()) {
	  cerr << "Couldnt open test file\n";
	  exit(1);
	}
	ofstream out(predfile);
	if (!out.is_open()) {
	  cerr << "Couldnt open prediction file" << endl;
	  exit(1);
	}

	if (out == 0) {
	  cerr << "Couldnt open prediction file\n";
	  exit(1);
	}
	unsigned long int n = 0;
	unsigned long int incorrect = 0;
	while(in.getline(buf,BUFSIZE,'\n')) {
	  out << targetlabel << endl;
	  n++;
	  cout << ">" << pLabel(buf) << "<  (" << targetlabel << ")\n";
	  if (strcmp(targetlabel,pLabel(buf)) != 0) {
	    incorrect++;
	  }
	}
	cout << "Error: " << ((double)incorrect / (double)n) << endl;
	out.close();
      }
      break;
    } // switch
  } // if train
  
}












