/************************************************************************/
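/* simstr.c: find frequently appearing string patterns from a string, by
   collecting frequently appearing short substrings and extending them unless
   the "voting" does not succeed (the majority letter in a position of them is
   not determined).
   NOTE(review): this description appears to be carried over from simstr.c;
   the code below is the simset driver (similarity clustering of sets) --
   confirm and update. */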
/* 17/Mar/2010 Takeaki Uno  */

#ifndef _simset_c_
#define _simset_c_

#ifdef _NO_MAIN_
#define _simset_c_no_main_
#else
#define _NO_MAIN_
#endif

#define WEIGHT_DOUBLE
#define USE_MATH

#include<string.h>
#include"fstar.c"
#include"trsact.c"
#include"sgraph.c"
#include"grhfil.c"
#include"sspc.c"
#include"mace.c"
#include"medset.c"

/* Command-line state of simset; filled in by SIMSET_read_param and read by
   SIMSET_main. */
/* SIMSET_repeat: repetition limit of the similar-pair loop (0 = single pass;
     'O' flag sets 10000000, option -O sets an explicit count)
   SIMSET_ID: 1 = the clique file is the final output; 0 ('M' flag) = post-process it with medset
   SIMSET_deg_lb: 3rd positional argument; passed as "-l" to mace and "-t" to grhfil
   SIMSET_tpose: 't' flag; selects 'D' instead of 'd' for the first grhfil call
     and adds "-t " to the final medset call */
int SIMSET_repeat=0, SIMSET_ID=1, SIMSET_deg_lb=0, SIMSET_tpose=0;
/* SIMSET_frq_ub: option -U; when positive it is forwarded as " -U n" to the first sspc call
   SIMSET_ub: option -u; forwarded as "-u" to the first sspc call
   SIMSET_hist: 'H' flag; adds "-H " to the final medset call
   SIMSET_ratio: 'v' flag; adds "-i " to the final medset call */
int SIMSET_frq_ub=0, SIMSET_ub=INTHUGE, SIMSET_hist = 0, SIMSET_ratio=0;
/* SIMSET_infname / SIMSET_outfname: input and output file names (positional)
   SIMSET_com: similarity-measure letter (one of I,i,C,T,S,R,s) passed to sspc;
     stays at its zero initial value if no such letter is given
   SIMSET_com2: measure letter passed to sspc in the merge phase (set by -M / -m, default 'R')
   SIMSET_workdir: option -W; prefix prepended to every temporary file name */
char *SIMSET_infname, *SIMSET_outfname, SIMSET_com, SIMSET_com2, *SIMSET_workdir="";
/* SIMSET_th: similarity threshold (positional); 0 skips the similar-pair phase
   SIMSET_vote_th: option -v; passed to the final medset call
   SIMSET_th2: option -M (merge threshold);  SIMSET_th3: option -m (independent-set threshold) */
double SIMSET_th = 0.0, SIMSET_vote_th=0.5, SIMSET_th2=0.0, SIMSET_th3=0.0;
/* SIMSET_sep: option "-,";  SIMSET_outperm_fname: option -Q
     (both are only stored here; they are not read anywhere in this file)
   SIMSET_edge: "e2" when the 'E' flag is given; appended to the command letters of the first grhfil call
   SIMSET_prog: "_" and/or "%" taken from the flag string; prepended to the
     command letters of the sub-commands */
char SIMSET_sep = 0, *SIMSET_outperm_fname = NULL, *SIMSET_edge="", SIMSET_prog[10];
/* SIMSET_no_remove: 'm' flag; adds "-M <input file>" to the grhfil graph conversion
   SIMSET_siz_lb: option -l; passed as "-l" to medset
   SIMSET_mes: 0 when the '_' flag is given; gates the per-iteration message
     and is handed to EXECSUB as its second argument
   SIMSET_append: '+' flag (only stored here; not read anywhere in this file) */
int SIMSET_no_remove=0, SIMSET_siz_lb=0, SIMSET_mes=1, SIMSET_append=0;
/* SIMSET_leave_tmp_files: '=' flag; keep the temporary files
   SIMSET_skip: '@' flag; jump directly to clique mining, reading the input file as the graph */
int SIMSET_leave_tmp_files=0, SIMSET_skip=0;
/* SIMSET_mes2: " -_" when the '_' flag is given; inserted right after "medset" in medset commands */
char *SIMSET_mes2 = "";

/* error routine: set ERROR_MES, print the command-line usage through
   print_err, then execute the project's EXIT macro.
   The text is a printf-style format string (hence "%%" for a literal '%').
   The hint for option -m reads '-mx', since SIMSET_read_param takes the
   measure letter from the third character of the option being parsed. */
void SIMSET_error (){
  ERROR_MES = "command explanation";
  print_err ("simset ISCMOt set-filename similarity-threshold degree-threshold output-filename\n\
%%:show progress, _:no message, +:write solutions in append mode, =:do not remove temporal files\n\
@:skip similarity computation (give similarity graph)\n\
i:set similarity measure to the ratio of one is included in the other\n\
I:set similarity measure to the ratio of both are included in the other\n\
S:set similarity measure to |A\\cap B|/max(|A|,|B|)\n\
s:set similarity measure to |A\\cap B|/min(|A|,|B|)\n\
C:set similarity measure to the cosign distance, the inner product of the normalized characteristic vectors\n\
T:set similarity measure to the intersection size, i.e., |A\\cap B|\n\
R:(recemblance) set similarity measure to |A\\cap B|/|A\\cup B|\n\
M:output intersection of each clique, instead of IDs of its members\n\
v (with M): output ratio of records, including each item\n\
m:do not remove edges in the data cleaning phase\n\
O:repeatedly similarity clustering until convergence\n\
t:transpose the input database, so that each line will be considered as a record\n\
E:read edge list file\n\
\n[options]\n\
-M [num]:merge similar cliques of similarity in [num] of recemblance (changes to 'x' by giving '-Mx')\n\
-m [num]:take independently cliques from similar cliques of similarity in [num] of recemblance, and merge the neighbors of each independent clique ('recemblance' changes to 'x' by giving '-mx')\n\
-v [num]:specify majority threshold (default=0.5)\n\
-u [num]:ignore records of size larger than [num]\n\
-l [num]:output clusters with size at least [num]\n\
-U [num]:ignore items of frequency larger than [num]\n\
-O [num]:specify the number of repetitions\n\
-W [dir]:specify the working directory (folder). The last letter of the directory has to be '/' ('\\')\n\
-, [char]:give the separator of the numbers in the output\n\
-Q [filename]:replace the output numbers according to the permutation table given by [filename]\n\
# the 1st letter of input-filename cannot be '-'.\n\
if similarity-threshold is 0, skip the similarity graph construction phase\n");
  EXIT;
}

/* read parameters given by command line
   Layout:  argv[1]            string of single-letter flags
            "-x value" pairs   zero or more options
            4 positionals      input file, similarity threshold,
                               degree threshold, output file
   The results are stored in the SIMSET_* globals.  If there are too few
   arguments, SIMSET_error() is called (it sets ERROR_MES) and the function
   returns.  A word starting with '-' that is not a known option ends the
   option list and is taken as the input file name. */
void SIMSET_read_param (int argc, char *argv[]){
  int c=1;
  char *flags;
  SIMSET_prog[0] = 0;

    // flag string + 4 positional arguments are mandatory
  if ( argc<c+5 ){ SIMSET_error (); return; }
  flags = argv[c];

    // message / housekeeping flags
  if ( strchr (flags, '_') ){ strcat (SIMSET_prog, "_"); SIMSET_mes = 0; SIMSET_mes2 = " -_";}
  if ( strchr (flags, '%') ) strcat (SIMSET_prog, "%");
  if ( strchr (flags, '+') ) SIMSET_append = 1;
  if ( strchr (flags, '=') ) SIMSET_leave_tmp_files = 1;
  if ( strchr (flags, '@') ) SIMSET_skip = 1;

    // similarity measure: the first matching letter in this order wins;
    // SIMSET_com is left untouched when none of them is present
  if ( strchr (flags, 'I') ) SIMSET_com = 'I';
  else if ( strchr (flags, 'i') ) SIMSET_com = 'i';
  else if ( strchr (flags, 'C') ) SIMSET_com = 'C';
  else if ( strchr (flags, 'T') ) SIMSET_com = 'T';
  else if ( strchr (flags, 'S') ) SIMSET_com = 'S';
  else if ( strchr (flags, 'R') ) SIMSET_com = 'R';
  else if ( strchr (flags, 's') ) SIMSET_com = 's';

    // remaining mode flags
  if ( strchr (flags, 'M') ) SIMSET_ID = 0;
  if ( strchr (flags, 'm') ) SIMSET_no_remove = 1;
  if ( strchr (flags, 'O') ) SIMSET_repeat = 10000000;
  if ( strchr (flags, 't') ) SIMSET_tpose = 1;
  if ( strchr (flags, 'H') ) SIMSET_hist = 1;
  if ( strchr (flags, 'v') ) SIMSET_ratio = 1;
  if ( strchr (flags, 'E') ) SIMSET_edge = "e2";
  c++;

  while ( argv[c][0] == '-' ){
      // an option at argv[c] needs its value at argv[c+1] and the four
      // positional arguments at argv[c+2] .. argv[c+5], hence argc >= c+6.
      // (The former test "argc<c+5" accepted argc == c+5, which left the
      //  output file name pointing at argv[argc], i.e. NULL.)
    if ( argc<c+6 ){ SIMSET_error (); return; }
    switch ( argv[c][1] ){
      case 'v':
        if ( (SIMSET_vote_th = atof(argv[c+1])) <= 0 ){
          error_num("the majority threshold has to be positive", atof(argv[c+1]), EXIT);
        }
        break;
      case 'm':
        if ( (SIMSET_th3 = atof(argv[c+1])) <= 0 ){
          error_num("the independent set threshold has to be positive", atof(argv[c+1]), EXIT);
        }
          // an optional third letter ("-mx") selects the measure; default 'R'
        if ( argv[c][2] ) SIMSET_com2 = argv[c][2]; else SIMSET_com2 = 'R';
        break;
      case 'M':
        if ( (SIMSET_th2 = atof(argv[c+1])) <= 0 ){
          error_num("the merge threshold has to be positive", atof(argv[c+1]), EXIT);
        }
          // an optional third letter ("-Mx") selects the measure; default 'R'
        if ( argv[c][2] ) SIMSET_com2 = argv[c][2]; else SIMSET_com2 = 'R';
        break;
      case 'u': SIMSET_ub = atoi(argv[c+1]); break;
      case 'l': SIMSET_siz_lb = atoi(argv[c+1]); break;
      case 'U': SIMSET_frq_ub = atoi(argv[c+1]); break;
      case 'O': SIMSET_repeat = atoi(argv[c+1]); break;
      case 'W': SIMSET_workdir = argv[c+1]; break;
      case ',': SIMSET_sep = argv[c+1][0]; break;
      case 'Q': SIMSET_outperm_fname = argv[c+1]; break;
      default: goto NEXT;   // not an option: treat it as the input file name
    }
    c += 2;
  }

  NEXT:;
  SIMSET_infname = argv[c];    // input file name
  SIMSET_th = atof(argv[c+1]);  // similarity threshold
  SIMSET_deg_lb = atoi(argv[c+2]);  // threshold for clique size
  SIMSET_outfname = argv[c+3];  // output file name
}

/******************************************************************************/
/* SIMSET_main: driver that chains the sub-programs grhfil, sspc, mace and
   medset through EXECSUB, exchanging data via temporary files named
   "<workdir>__tmp...__".
   Steps (each one is an EXECSUB call with the command line shown in the code):
     1. grhfil: preprocess the input file into __tmp__          (skipped with '@')
     2. sspc:   find similar pairs -> __tmp_out__, optionally repeated, with
                grhfil turning the result back into __tmp__ between rounds
                (if the similarity threshold is 0, mace is run on __tmp__
                 directly and steps 2-4 are skipped)
     3. grhfil: convert __tmp_out__ to a graph in __tmp__
     4. mace:   clique mining -> __tmp_out3__, then grhfil -> __tmp_out2__
     5. optional merge of similar cliques (sspc + medset) when -M / -m is given
     6. final output: medset on __tmp_out2__ ('M' flag), or rename __tmp_out2__
   Returns 0 on success, 1 if parameter reading or any sub-program reported an
   error (EXECSUB is given "goto END" as its failure action).
   NOTE(review): the "no similar pair" case still calls exit(1) directly, so
   temporary files are not removed on that path. */
int SIMSET_main (int argc, char *argv[]){
  int flag=1, count=0;
  size_t siz2, siz=0;
  char *W, s1[1000];

    // read params
  SIMSET_read_param (argc, argv);
  if ( ERROR_MES ) return (1);
    // take the working directory only now: reading it before the parameters
    // are parsed would always yield the default and ignore option -W
  W = SIMSET_workdir;

  if ( SIMSET_skip ) goto FIND_CLIQUE;
    // preprocess the input file
  EXECSUB (GRHFIL_main, SIMSET_mes, goto END,
      "grhfil %sB%c%s \"%s\" \"%s__tmp__\"", SIMSET_prog, SIMSET_tpose?'D':'d', SIMSET_edge, SIMSET_infname, W);

  if ( SIMSET_th == 0.0 ){
    // clique mining
    EXECSUB (MACE_main, SIMSET_mes, goto END, "mace %sM -l %d \"%s__tmp__\" \"%s__tmp_out2__\"", SIMSET_prog, SIMSET_deg_lb, W, W);
    goto NEXT;
  }
    // find similar pairs
  s1[0] = 0; if ( SIMSET_frq_ub > 0 ) sprintf (s1, " -U %d", SIMSET_frq_ub);
  do {
      // count is the 1-based round number; it is advanced unconditionally so
      // that the loop behaves the same with and without messages
    count ++;
    if ( SIMSET_mes ) print_err ("%dth-iter\n", count);
    siz2 = siz;
    if ( count == 1 ){
        // only the first round applies the size / frequency filters
      EXECSUB (SSPC_main, SIMSET_mes, goto END, "sspc %s%c%s -l %d -u %d \"%s__tmp__\" %f \"%s__tmp_out__\"",
          SIMSET_prog, SIMSET_com, s1, (int)SIMSET_th, (int)SIMSET_ub, W, SIMSET_th, W);
    } else {
      EXECSUB (SSPC_main, SIMSET_mes, goto END, "sspc %s%c \"%s__tmp__\" %f \"%s__tmp_out__\"",
          SIMSET_prog, SIMSET_com, W, SIMSET_th, W);
    }
    siz = internal_params.l1;

    if ( siz == 0 ){ print_err (" no similar pair exists"); exit(1); }
      // stop when: no repetition requested, requested count reached, the
      // value read from internal_params.l1 did not change, or the hard cap
      // is hit (19 rounds, the same cap that applied when messages were on)
    if ( !SIMSET_repeat || SIMSET_repeat == count || siz == siz2 || count >= 19 ) break;

    EXECSUB (GRHFIL_main, SIMSET_mes, goto END,"grhfil %sue -o %d \"%s__tmp_out__\" \"%s__tmp__\"",
        SIMSET_prog, (int)SIMSET_th, W, W);
  } while (1);

     // transform to graph format
  s1[0] = 0;
  if ( SIMSET_no_remove == 1 ){
    EXECSUB (GRHFIL_main, SIMSET_mes, goto END, "grhfil %sue2 -t %d -M %s \"%s__tmp_out__\" \"%s__tmp__\"", SIMSET_prog, (int)SIMSET_deg_lb, SIMSET_infname, W, W);
  } else {
    EXECSUB (GRHFIL_main, SIMSET_mes, goto END, "grhfil %sue2 -t %d \"%s__tmp_out__\" \"%s__tmp__\"", SIMSET_prog, (int)SIMSET_deg_lb, W, W);
  }

  FIND_CLIQUE:;
    // clique mining; with '@' the input file itself is read as the graph
  EXECSUB (MACE_main, SIMSET_mes, goto END, "mace %sM -l %d \"%s%s\" \"%s__tmp_out3__\"",
   SIMSET_prog, SIMSET_deg_lb, W, SIMSET_skip? SIMSET_infname: "__tmp__", W);
  EXECSUB (GRHFIL_main, SIMSET_mes, goto END, "grhfil ds \"%s__tmp_out3__\" \"%s__tmp_out2__\"",
      W, W);

  NEXT:;

    // merge similar cliques
  if ( SIMSET_th2 > 0.0 || SIMSET_th3 > 0.0 ){
      // the clique file is moved to the output name and used as the input
      // of the merge; the merged result goes back to __tmp_out2__
    sprintf (common_comm, "%s__tmp_out2__", W);
    rename (common_comm, SIMSET_outfname);
    EXECSUB (SSPC_main, SIMSET_mes, goto END, "sspc %s%c \"%s\" %f \"%s__tmp2__\"",
       SIMSET_prog, SIMSET_com2, SIMSET_outfname, SIMSET_th2+SIMSET_th3, W);
    EXECSUB (MEDSET_main, SIMSET_mes, goto END, "medset%s -l %d -%c \"%s__tmp2__\" \"%s\" 1 \"%s__tmp_out2__\"",
          SIMSET_mes2, SIMSET_siz_lb, SIMSET_th2>0.0? 'T': 'I', W, SIMSET_outfname, W);
  }
    // take intersection
  if ( !SIMSET_ID ){
    EXECSUB (MEDSET_main, SIMSET_mes, goto END, "medset%s %s-l %d %s%s\"%s__tmp_out2__\" \"%s\" %f \"%s\"",
        SIMSET_mes2, SIMSET_ratio?"-i ": "", SIMSET_siz_lb, SIMSET_tpose?"-t ":"",
         SIMSET_hist?"-H ":"", W, SIMSET_infname, SIMSET_vote_th, SIMSET_outfname);
    if ( !SIMSET_leave_tmp_files ) REMOV (W, "__tmp_out2__");
  } else {
      // the clique (or merged) file is the final result
    sprintf (common_comm, "%s__tmp_out2__", W);
    rename (common_comm, SIMSET_outfname);
  }
  flag = 0;

///////////////////////////////////////////////////////////
  END:;
  if ( !SIMSET_leave_tmp_files ){
    MREMOV (W, "__tmp__", "__tmp2__", "__tmp_out__", "__tmp_out2__");
      // __tmp_out3__ (raw mace output) is also an intermediate file
    REMOV (W, "__tmp_out3__");
  }
  return (flag);
}

/*******************************************************************************/
#ifndef _simset_c_no_main_
/* program entry point: forward the command line to SIMSET_main and use its
   return value as the exit status */
int main (int argc, char *argv[]){
  int status = SIMSET_main (argc, argv);
  return status;
}
#endif
/*******************************************************************************/

#endif




