/* take the intersection of each cluster */
/* 8/Nov/2008   Takeaki Uno  */

#ifndef _medset_c_
#define _medset_c_

#include"fstar.c"
#include"problem.c"

/* bit flags kept in PROBLEM.problem; each is switched on by a command line option
   in MEDSET_read_param() */
/* -T: clustering by connected components */
#define MEDSET_CC 4
/* -I: clustering by an independent set, whose vertices are used as the seeds */
#define MEDSET_IND 8
/* -H: do not use the histogram; just output the items */
#define MEDSET_NO_HIST 16
/* -i: output, for each item, the ratio of records including the item */
#define MEDSET_RATIO 32

/* report a command line error: set ERROR_MES, so that the caller can detect that the
   arguments were not accepted, and print the usage of medset through print_err.
   Only the options that MEDSET_read_param() really accepts are listed. */
void MEDSET_error (){
  ERROR_MES = "command explanation";
  print_err ("medset: compute the intersection of each set of sets\n\
usage: medset [-HTIitl] cluster-filename set-filename threshold output-filename\n\
-%%: show progress\n\
-_: no message\n\
-H: do not use histogram, just output the items\n\
-T: clustering by connected component (read edge type file)\n\
-I: find an independent set, and clustering by using the vertices in it as seeds (read edge type files)\n\
-i: output for each item, ratio of records including the item\n\
-t: transpose the input database, (transaction will be item, and vice versa)\n\
-l [num]: output clusters of size at least [num]\n\
# the 1st letter of input-filename cannot be '-'.\n");
  EXIT;
}

/* read commands and options from command line, and store them to PP.
   The options (arguments whose 1st letter is '-') have to come first, and are followed by
     cluster-filename set-filename threshold output-filename
   When the arguments are too few, MEDSET_error() is called and the file names and
   the threshold are not stored. */
void MEDSET_read_param (PROBLEM *PP, int argc, char *argv[]){
  int c=1;

  if ( argc < c+3 ){ MEDSET_error (); return; }
  PP->dir = 1; PP->FS.flag |= SHOW_MESSAGE;
  while ( argv[c][0] == '-' ){
      // at least three arguments have to remain; this also keeps argv[c+1] of -l valid
    if ( argc<c+3 ){ MEDSET_error (); return; }
    switch ( argv[c][1] ){
      case 't': PP->FS.flag |= LOAD_TPOSE;   // transpose the input database
      break; case '_': PP->FS.flag &= ~SHOW_MESSAGE;   // no message; clearing the bit is safe even if -_ is given twice
      break; case '%': PP->FS.flag |= SHOW_PROGRESS;   // show progress
      break; case 'T': PP->problem |= MEDSET_CC;   // connected component clustering
      break; case 'I': PP->problem |= MEDSET_IND;   // independent set clustering
      break; case 'H': PP->problem |= MEDSET_NO_HIST;   // do not use histogram
      break; case 'l': PP->num = atoi(argv[c+1]); c++;   // least cluster size
      break; case 'i': PP->problem |= MEDSET_RATIO;   // output included-ratio of items
      break; default: break;   // the other letters are ignored, as before
    }
    c++;
  }

    // the last option may have consumed arguments (-l takes two);
    // check again so that argv[c+2] below is a real argument
  if ( argc < c+3 ){ MEDSET_error (); return; }

  PP->input_fname = argv[c];
  PP->FS.fname = argv[c+1];
  PP->th = atof(argv[c+2]);
    // c+3 <= argc holds here, and argv[argc] is a null pointer;
    // thus output_fname becomes NULL when output-filename is not given
  PP->output_fname = argv[c+3];
}

/* read the cluster file fp; each line of it is a list of set IDs (a cluster).
   For each line, count for every item the number of the listed sets including it,
   and write to PP->II.fp the items whose ratio (count)/(#IDs in the line) is
   no less than PP->th. One output line is written for one input line.
   An item is written as "(item:ratio)" if MEDSET_RATIO is given without MEDSET_NO_HIST,
   and as "item" otherwise.
   A set ID out of the range terminates the program with a non-zero exit status. */
void MEDSET_read_file (PROBLEM *PP, FILE2 *fp){
  FSTAR *FS = &PP->FS;
  FSTAR_INT *cnt, *que, t, s, i, x;

  calloc2 (cnt, FS->in_node_num, EXIT);   // cnt[item]: #sets of the current line including the item
  calloc2 (que, FS->in_node_num*2, goto END);   // pairs: que[2k] = count, que[2k+1] = item
  
  do {
    s = t = 0;   // s: #set IDs read from the line, t: #distinct items found in the line
    do {   // accumulate the items of each set written in the line
      x = (FSTAR_INT)FILE2_read_int (fp);
      if ( FILE_err&4 ) break;   // presumably "no number has been read" -- TODO confirm with FILE2
      if ( x<0 || x >= FS->out_node_num ){
        print_err ("set ID out of bound %d>%d\n", x, FS->out_node_num);
        exit(1);   // an error must not be reported as a success (it was exit(0))
      }
      FLOOP (i, FS->fstar[x], FS->fstar[x+1])
          if ( cnt[FS->edge[i]]++ == 0 ){ que[t*2+1] = FS->edge[i]; t++; }   // register the item at its first appearance
      s++;
    } while ( (FILE_err&3)==0 );   // presumably until the end of the line or of the file -- TODO confirm
    if ( s>0 ){
        // move the counts to the pairs, and clear cnt[] for the next line
      FLOOP (i, 0, t){ que[i*2] = cnt[que[i*2+1]]; cnt[que[i*2+1]] = 0; }
        // presumably sorts the pairs in the decreasing order of the counts (negative unit size);
        // the "break" below relies on this order -- TODO confirm with qsort_FSTAR_INT
      qsort_FSTAR_INT (que, t, -((int)sizeof(FSTAR_INT))*2);
      FLOOP (i, 0, t){
        if ( ((double)que[i*2])/(double)s < PP->th ) break;
        if ( (PP->problem & MEDSET_RATIO) && !(PP->problem & MEDSET_NO_HIST) )
            fprintf (PP->II.fp, "(%d:%.2f) ", que[i*2+1], ((double)que[i*2])/(double)s);
        else fprintf (PP->II.fp, "%d ", que[i*2+1]);
      }
    }
    fprintf (PP->II.fp, "\n");   // written also for a line without any set ID
  } while ( (FILE_err&2)==0 );   // presumably until the end of the file -- TODO confirm

  END:;
  mfree (cnt, que);
}

/* ascend the tree from x to its root, following the parent pointers pnt[]
   (a root is a node being the parent of itself), and then connect every node
   on the traced path directly to the root. Returns the root. */
FSTAR_INT MEDSET_ascend (FSTAR_INT *pnt, FSTAR_INT x){
  FSTAR_INT root, next;

    // 1st walk: climb up until a node pointing to itself
  for (root=x ; pnt[root] != root ; root=pnt[root]);

    // 2nd walk: re-link the nodes of the path to the root
  for ( ; pnt[x] != root ; x=next){
    next = pnt[x];
    pnt[x] = root;
  }
  return (root);
}

/* output clusters to the output file PP->II.fp, one cluster per line.
   On entry, mark[i] is non-zero for the seed i of each cluster (then pnt[i] == i, as
   set by the callers), and pnt[i] of any other node i < xmax is the seed of its cluster.
   pnt[] is rewritten to circular linked lists, one list for one cluster.
   With MEDSET_NO_HIST the member nodes are written; otherwise the union of the items
   of the members is written (bit 2 of mark[] is used to write each item once).
   Clusters having less than PP->num members are not written. */
void MEDSET_print_clusters (PROBLEM *PP, char *mark, FSTAR_INT *pnt, FSTAR_INT xmax){
  FSTAR_INT i, x, y, end, *cls=NULL;
  FSTAR *FS = &PP->FS;

    // insert each non-seed node just after its seed in the circular list of the seed
  FLOOP (i, 0, xmax)
      if ( mark[i] == 0 ){ x = pnt[i]; pnt[i] = pnt[x]; pnt[x] = i; }

    // allocate memory for union-computation
  if ( !(PP->problem & MEDSET_NO_HIST) ) calloc2 (cls, FS->ymax+2, EXIT);

    // print the clusters
  FLOOP (i, 0, xmax){
    if ( mark[i] == 0 ) continue;   // not a seed
      // count the members by going around the list once; the walk has to start from
      // the successor of the seed (starting from the seed itself stops immediately,
      // so that any cluster looked like a singleton)
    for (y=1,x=pnt[i] ; x != i ; x=pnt[x] ) y++;
    if ( y < PP->num ) continue;   // lower than the least cluster size
    x = i; end = 0;
    do {
      if ( PP->problem & MEDSET_NO_HIST ) fprintf (PP->II.fp, "%d ", x);
        // NOTE(review): x can be any ID less than xmax, which the callers enlarge by the IDs
        // in the edge file; confirm that FS->fstar[] is long enough for such x
      else FLOOP (y, FS->fstar[x], FS->fstar[x+1]){
        if ( (mark[FS->edge[y]]&2) == 0 ){   // the item is not yet written for this cluster
          mark[FS->edge[y]] |= 2;
          cls[end++] = FS->edge[y];
          fprintf (PP->II.fp, "%d ", FS->edge[y]);
        }
      }
      x = pnt[x];
    } while ( x != i );
    fprintf (PP->II.fp, "\n");
      // clear the "written" bits of the items for the next cluster
    if ( !(PP->problem & MEDSET_NO_HIST) )
        FLOOP (y, 0, end) mark[cls[y]] &= 1;
  }
  free2 (cls);
}

/* clustering the nodes by the connected components.
   fp is the edge file; the first two numbers of each line are the end nodes of an edge.
   The components are merged on the union-find forest pnt[] (pnt[v] is the parent of v,
   and a root is the parent of itself), and the clusters are written by
   MEDSET_print_clusters(). Lines with a negative node ID are skipped. */
void MEDSET_cc_clustering (PROBLEM *PP, FILE2 *fp){
  FSTAR_INT x, y, *pnt=NULL, i, end=0, eend, xmax=0;
  char *mark = NULL;
  FSTAR *FS = &PP->FS;

    // merge the components connected by an edge, by using the union-find forest
  while (1){
    x = (FSTAR_INT)FILE2_read_int (fp);
    if ( FILE_err&2 ) break;      // presumably the end of the file -- TODO confirm with FILE2
    if ( FILE_err&4 ) continue;   // presumably no number has been read -- TODO confirm
    y = (FSTAR_INT)FILE2_read_int (fp);
    if ( FILE_err&4 ) continue;
    FILE2_read_until_newline (fp);
      // a negative ID would access pnt[] out of bound; skip the line as a malformed one,
      // in the same way as the lines without numbers
    if ( x<0 || y<0 ) continue;

    ENMAX (xmax, MAX(FS->xmax, MAX(x, y)+1)); 
    eend = end; reallocz (pnt, end, xmax, goto END);  // expand pnt; `end` is presumably updated to the new size
    while ( eend < end ){ pnt[eend] = eend; eend++; }   // each new node is a root, i.e., a singleton component
    x = MEDSET_ascend (pnt, x);
    y = MEDSET_ascend (pnt, y);
    if ( x != y ){
      if ( x < y) pnt[y] = x; else pnt[x] = y;   // the root of the smaller ID becomes the root of the merged tree
    }
  }

    // convert each tree to a star; every node points directly to its root
  FLOOP (i, 0, end) MEDSET_ascend (pnt, i);

    // mark[] is shared by the nodes (seed flags) and the items (used in MEDSET_print_clusters)
  calloc2 (mark, MAX(xmax, FS->ymax)+2, goto END);
  FLOOP (i, 0, xmax) if ( pnt[i] == i ) mark[i] = 1;   // mark to the seeds

  MEDSET_print_clusters (PP, mark, pnt, xmax);

  END:;
  mfree (pnt, mark);
}


/* clustering the nodes by finding independent set.
   fp is the edge file; the first two numbers of each line are the end nodes of an edge.
   The file is scanned repeatedly (at most LOOP-2 iterations, two scans for each).
   The meaning of pnt[v] at the iteration i is
     pnt[v] <= i     : v is active, i.e., a candidate of a seed
     pnt[v] == i+1   : v was removed from the candidates at this iteration
     pnt[v] >= LOOP  : v belongs to the cluster of the node pnt[v]-LOOP
   After the iterations, the nodes with pnt[v] < LOOP are the seeds, and the clusters
   are written by MEDSET_print_clusters(). Lines with a negative node ID are skipped. */
void MEDSET_ind_clustering (PROBLEM *PP, FILE2 *fp){
  FSTAR_INT x, y, i=0, end=0, xmax=0, flag, *pnt=NULL;
  char *mark=NULL;
  FSTAR *FS = &PP->FS;
  int LOOP=1000;

    // independent set
  for (i=0 ; i<LOOP-2 ; i++){
      // 1st scan: make the set of active nodes independent
    FILE2_reset (fp); flag = 1;
    while (1){
      x = (FSTAR_INT)FILE2_read_int (fp);
      if ( FILE_err&2 ) break;      // presumably the end of the file -- TODO confirm with FILE2
      if ( FILE_err&4 ) continue;   // presumably no number has been read -- TODO confirm
      y = (FSTAR_INT)FILE2_read_int (fp);
      if ( FILE_err&4 ) continue;
      FILE2_read_until_newline (fp);
        // a negative ID would access pnt[] out of bound; skip the line as a malformed one
      if ( x<0 || y<0 ) continue;

      ENMAX (xmax, MAX(FS->xmax, MAX(x, y)+1)); 
      reallocx (pnt, end, xmax, 0, goto END);  // expand pnt; the new cells are presumably initialized by 0 (active)
      if ( pnt[x] <= i && pnt[y] <= i ){
        flag = 0;
        if ( x < y ) pnt[y] = i+1; else pnt[x] = i+1; // delete one of (larger) active node if two active nodes are adjacent
      }
    }
    if ( flag ) break; // no change occurred

      // 2nd scan: attach the nodes adjacent to an active node to its cluster
    FILE2_reset (fp);
    while (1){
      x = (FSTAR_INT)FILE2_read_int (fp);
      if ( FILE_err&2 ) break;
      if ( FILE_err&4 ) continue;
      y = (FSTAR_INT)FILE2_read_int (fp);
      if ( FILE_err&4 ) continue;
      FILE2_read_until_newline (fp);
      if ( x<0 || y<0 ) continue;   // the same lines as the 1st scan have to be skipped

      if ( pnt[x] <= i || pnt[y] <= i ){
          if ( pnt[x] <= i ) pnt[y] = x+LOOP; else pnt[x] = y+LOOP;  // the other end joins the cluster of the active end
      }
    }
  }

    // mark[] is shared by the nodes (seed flags) and the items (used in MEDSET_print_clusters)
  calloc2 (mark, MAX(xmax, FS->ymax)+1, goto END);

    // make clusters
  FLOOP (i, 0, xmax){
    if ( pnt[i] < LOOP ){ mark[i] = 1; pnt[i] = i; }   // mark to the seeds
    else pnt[i] -= LOOP;   // decode the seed of the cluster
  }
  
  MEDSET_print_clusters (PP, mark, pnt, xmax);

  END:;
  mfree (mark, pnt);
}


/*******************************************************************/
/* entry of medset: read the parameters, load the problem, open the cluster file
   and the output file, and execute the task selected by the options.
   Returns 1 if ERROR_MES is set, and 0 otherwise. */
int MEDSET_main (int argc, char *argv[]){
  PROBLEM PP;
  FILE2 fp;

    // parameters
  PROBLEM_init (&PP);
  MEDSET_read_param (&PP, argc, argv);
  if ( ERROR_MES ) return (1);

    // load the set file
  PP.FS.edge_dir = 1;
  PP.FS.flag |= LOAD_BIPARTITE;
  print_mes (&PP.FS, "medset: cluster-file= %s set-file= %s threshold= %f output-file= %s\n", PP.input_fname, PP.FS.fname, PP.th, PP.output_fname);
  PROBLEM_load (&PP);

    // open the cluster file and the output file
  FILE2_open (fp, PP.input_fname, "r", goto FINISH);
  fopen2 (PP.II.fp, PP.output_fname, "w", goto FINISH);

    // nothing is computed if an error has occurred so far
  if ( ERROR_MES ) goto FINISH;

  if ( PP.problem & MEDSET_CC ){
    MEDSET_cc_clustering (&PP, &fp);
  } else if ( PP.problem & MEDSET_IND ){
    MEDSET_ind_clustering (&PP, &fp);
  } else {
    MEDSET_read_file (&PP, &fp);
  }

  FINISH:;
  FILE2_close (&fp);
  fclose2 (PP.II.fp);
  PROBLEM_end (&PP);

  if ( ERROR_MES ) return (1);
  return (0);
}


/*******************************************************************************/
#ifndef _NO_MAIN_
#define _NO_MAIN_
/* program entry; the exit status is the return value of MEDSET_main() */
int main (int argc, char *argv[]){
  int status = MEDSET_main (argc, argv);
  return (status);
}
#endif
/*******************************************************************************/

#endif


