/*
   NAME
   freedup -- search for duplicate files in one or more directory hierarchies
   Copyright (C) 1990, 91, 92, 93, 94, 2000, 2003, 2007 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
   USA.*/

/* GNU freedup was written by Andreas Neuper <ANeuper@web.de>  */
#include "freedup.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Name this program was called with. */
char *program_name;


/* SYNOPSIS
 *
 * usage() writes the command line summary to stdout: the invocation
 * line (using program_name), one line per option, two closing remarks
 * and the version banner built from the VERSION macro.
 */
void usage(void)
{
   /* One entry per output line that needs no formatting. */
   static const char *const help_lines[] = {
	    "\t-a\t   provide compatibility to freedups by William Stearns.[=-up]\n",
	    "\t-c\t   count file space savings.\n",
	    "\t-d\t   requires the modification time stamps to be equal.\n",
	    "\t-f\t   requires the path-stripped file names to be equal.\n",
	    "\t-h\t   shows this help. [other option are ignored]\n",
	    "\t-m <bytes> only touch larger files. (deprecated: use -o \"-size +#\")\n",
	    "\t-n\t   do not really perform links [no action].\n",
	    "\t-o <opts>  pass an option string to the initially called find command.\n",
	    "\t-p\t   requires file permissions to be equal.\n",
	    "\t-s\t   generate symlinks although some given paths are relative.\n",
	    "\t-u\t   requires user & group to be equal.\n",
	    "\t-v\t   display shell commands to perform linking [verbose].\n",
	    "\t<dir>\t   any directory to scan for duplicate files recursively.\n\n",
	    "\tOptions are toggle switches. Their final state applies.\n",
	    "\tLater <dir> entries are linked to the earlier ones.\n"
   };
   size_t row;

   printf(  "USAGE:\t%s [options] <dir1> [<dir2> ...]\n\n", program_name );
   for( row = 0; row < sizeof help_lines / sizeof help_lines[0]; row++ )
	fputs( help_lines[row], stdout );
   printf(  "\tVersion " VERSION " by Andreas Neuper (c)2007.\n\n" );
}
   
/* OPTION DEFAULTS
 *
 * Global option state. main() fills these from the command line; the
 * flag options flip between 0 and 1 each time the option is given.
 */
	int testmode=0;		/* -n: when set, link_files() skips rename/link/unlink */
	int showmode=0;		/* -v: when set, link_files() prints the ln command */
	int showsave=0;		/* -c: when set, link_files() prints the size per link */
	int sametime=0;		/* -d: consulted by the st_mtime test in comparison_by_size() */
	int sameperm=0;		/* -p: consulted by the permission test in comparison_by_size() */
	int sameuser=0;		/* -u: consulted by the uid/gid test in comparison_by_size() */
	int samename=0;		/* -f: consulted by the basename test in comparison_by_size() */
	int minbytes=0;		/* -m: size threshold compared against st_size in comparison_by_size() */
	int nosymlinks=0;	/* -s, or forced to 1 by main() for relative paths: no cross-device links */
	char*findopta=NULL;	/* -o: heap copy of extra options spliced into the find command */

/* DESCRIPTION

   main() handles the command line parameters
	  scans for files to consider
	  builds a list of all files to consider
	  qsort()s the list of files
   comparison_by_size() is called for two files by qsort()
	  for equally sized files compares hash signature
	  comparison_by_content() when the same size & hash
   comparison_by_content() is only called by comparison_by_size()
	  reads and compares both files block by block
	  determines priority, i.e. later args count less
	  calls link_files() for identical files
   link_files() replaces the second file by a link to the first
	  determines whether both files are on the same fs
	  uses inode hard links on the same device number
	  uses symlinks if device numbers differ

   Additionally various helper functions are deployed.
*/

#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#else
#include <sys/file.h>
#endif

#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#include <assert.h>

void perror(const char *s);
#include <errno.h>
int errno;

/* internal counters */

/* link actions performed
 * All three are updated by link_files() and reported by main().
 */
int totalinked=0;		/* every link action, hard or symbolic (also counted in test mode) */
int symlinked=0;		/* the subset of link actions that used a symbolic link */
unsigned long long linksaved=0;	/* accumulated st_size of the linked files, in bytes */

/* The number of regular files to scan; main() sums it up from the
   output of "find ... | wc -l" for every directory argument. */
int filecount=0;


/* Time at start of execution; only assigned in main() when DEBUG is defined. */
time_t start_time;

/* Nothing visible in this file assigns this variable; main() merely prints
   it when DEBUG is defined. NOTE(review): the former comment mentioned a
   -daystart option, which this program does not parse -- presumably a
   leftover from the code this file was derived from; confirm and remove. */
time_t cur_day_start;

/* Status value to return to system; main() sets it to 0 and passes it to exit(). */
int exit_status;

/* The array pointer we use for sorting; allocated and filled by main(). */
finfo*file_info;


#ifdef MD5SUM
/* Statistics of the hash stage, reported by main(). */
int hashreject=0;	/* comparisons avoided because the two hashes differed */
int hashread=0;		/* number of gethash() calls, i.e. files that were hashed */

/* gethash() -- compute and cache the hash string of one file.
 *
 * a	file entry; a->name is read, a->hash receives a freshly allocated,
 *	NUL-terminated string of MATCHCNT characters.
 *
 * The hash is obtained by running the external MD5SUM command through
 * the shell. Whenever that does not yield a well-formed line (command
 * too long, popen() failure, no output, unexpected layout) a dummy hash
 * of MATCHCNT '0' characters is stored instead. This is safe because
 * the caller still compares the file contents when hashes are equal.
 * Running out of memory terminates the program.
 *
 * NOTE(review): a->name is embedded in double quotes only, so names
 * containing '"', '$', '`' or '\\' are still interpreted by the shell.
 */
static void gethash(finfo*a)
{
    char buffer[MAXPATHLENGTH+MATCHCNT+4];
    FILE*file2hash;
    int cmdlen;
    int valid=0;

    hashread++;
    a->hash = calloc(MATCHCNT+2,sizeof(char));
    if( a->hash==NULL )
      {
	perror("calloc() failed while storing a file hash");
	exit(-1);
      }
    /*
     * this routine uses external shell commands;
     * errors are redirected because unusual file names make them likely
     */
    cmdlen = snprintf(buffer,sizeof buffer,MD5SUM " \"%s\" 2>/dev/null", a->name);
    if( cmdlen>=0 && (size_t)cmdlen<sizeof buffer )
      {
	file2hash = popen (buffer, "r");
	if( file2hash!=NULL )
	  {
	    /*  if everything works fine
	     *  there should appear MATCHCNT characters
	     *  followed by two spaces and the file name
	     */
	    if( fgets(buffer,sizeof buffer,file2hash)!=NULL
	     && strlen(buffer)>=(size_t)(MATCHCNT+2)
	     && buffer[MATCHCNT]==' '
	     && buffer[MATCHCNT+1]==' ' )
		valid=1;
	    pclose(file2hash);
	  } else
	  {
	    perror("popen() failed while hashing a file");
	  }
      }
    if( valid )
      {
	/* store the hash value to avoid recalculation */
	memcpy(a->hash,buffer,MATCHCNT);
      } else
      {
	/*  hash was unsuccessful; write std dummy value.
	 *  it would be better (correct)
	 *  to link it to some temporary file,
	 *  to evaluate it, and to continue then
	 */
	memset(a->hash,'0',MATCHCNT);
      }
    /* calloc() already zeroed the string; terminate explicitly anyway */
    a->hash[MATCHCNT]=0;
}
#endif /* MD5SUM */

/* statcpy() -- (re)read the file status of one entry.
 *
 * a	file entry; lstat() is run on a->name and the result is stored
 *	in a->info, so symbolic links are reported as links.
 *
 * A failing lstat() is reported with perror() and ends the program
 * with exit(-1). With DEBUG defined the device, inode and block count
 * are printed before and after the call.
 */
void statcpy(finfo*a)
{
    struct stat*st=&(a->info);

#ifdef DEBUG
    printf("%s(%x:%d:%d) --> \n", a->name, (int)st->st_dev, (int)st->st_ino, (int)st->st_blocks);
#endif /* DEBUG */
    if(lstat(a->name, st) == 0)
      {
#ifdef DEBUG
	printf("\t\t%s(%x:%d:%d)\n", a->name, (int)st->st_dev, (int)st->st_ino, (int)st->st_blocks);
#endif /* DEBUG */
	return;
      }
    perror("lstat() failed while reading file statistics");
    exit(-1);
}

/* link_files() -- replace file b by a link to file a.
 *
 * a	the file that is kept
 * b	the file that is replaced; b->info is refreshed afterwards
 *
 * Nothing happens when either entry is a symbolic link (checked on
 * st_mode) or when the files live on different devices while
 * nosymlinks is set. Otherwise b is moved aside to "<b>.tmp", a hard
 * link (same device) or a symbolic link (different devices) is created
 * under b's name and the temporary file is removed. If the move fails
 * nothing is changed; if the link cannot be created the temporary file
 * is moved back. In both cases the counters stay untouched.
 *
 * In test mode no file is modified, but the command is still shown
 * (showmode), the saving is still reported (showsave) and the counters
 * totalinked, symlinked and linksaved are still updated.
 *
 * The reported saving is the size b had before it was replaced, so it
 * is not affected by the re-stat of a freshly created symbolic link.
 */
void link_files(finfo*a, finfo*b)
{
    char tmpfile[MAXPATHLENGTH+16];
    unsigned long long saved;
    int samedev;
    int len;
    int rc;

    if( S_ISLNK(a->info.st_mode) || S_ISLNK(b->info.st_mode) )
        /* either one is already symlinked
	 * this avoids building up symlink chains (in case of a)
	 * or alternating symlinks during sort (in case of b)
	 */
	return;
    samedev = ( a->info.st_dev == b->info.st_dev );
    if( nosymlinks && !samedev )
        /* when on different devices and symlinks are not allowed
	 * do not link, i.e. do not even prepare to link.
	 * This needs to be caught before any execution.
	 */
	return;
#ifdef DEBUG
    printf("LINK: %s(%d/%s) <-- %s(%d/%s)\n",
	a->name, (int)a->info.st_size, a->hash,
	b->name, (int)b->info.st_size, b->hash);
#endif /* DEBUG */
    /* remember the size now; statcpy(b) below overwrites b->info */
    saved = (unsigned long long)b->info.st_size;
    tmpfile[0] = 0;
    if(! testmode )
      {
	/* move file to tmp ; link it ; remove tmp
	 */
	len = snprintf(tmpfile,sizeof tmpfile,"%s.tmp",b->name);
	if( len<0 || (size_t)len>=sizeof tmpfile )
	  {
	    fprintf(stderr,"path too long, not linking \"%s\"\n",b->name);
	    return;
	  }
	if( rename(b->name,tmpfile)!=0 )
	  {
	    perror("rename() failed while moving a duplicate aside");
	    return;
	  }
      }
    /* either st_dev or st_ino are different,
     * was checked earlier prior to call
     */
    if( showmode )
      {
	if( samedev ) printf("ln \"%s\" \"%s\"\n",a->name,b->name);
	else          printf("ln -s \"%s\" \"%s\"\n",a->name,b->name);
      }
    if(! testmode )
      {
	rc = samedev ? link(a->name,b->name) : symlink(a->name,b->name);
	if( rc!=0 )
	  {
	    perror("creating the link failed, restoring the original file");
	    if( rename(tmpfile,b->name)!=0 )
		perror("rename() failed while restoring the original file");
	    return;
	  }
      }
    if(! samedev ) symlinked++;
    statcpy(b);
    if( showsave )
      {
	printf("Linking \"%s\" saves %llu bytes\n",
		basename(b->name),
		saved);
      }
    linksaved+=saved;
    if(! testmode )
      {
	if( unlink(tmpfile)!=0 )
	    perror("unlink() failed while removing the temporary file");
      }
    totalinked++;
}

int comparison_by_content(const void*a,const  void*b)
{
    FILE*afile=NULL,
	*bfile=NULL;
    char ablock[BLOCK+2],
	 bblock[BLOCK+2];
    int  aread,
	 bread,
	 result;
    assert(a!=NULL);
    assert(b!=NULL);
#ifdef DEBUG
    printf("CONTENT: %s(%d/%s) <> %s(%d/%s)\n",
	(((finfo*)a)->name),(((finfo*)a)->info.st_size),(((finfo*)a)->hash),
	(((finfo*)b)->name),(((finfo*)b)->info.st_size),(((finfo*)b)->hash));
#endif /* DEBUG */
    afile=fopen(((finfo*)a)->name,"r");
    bfile=fopen(((finfo*)b)->name,"r");
    while(!(feof(afile)||feof(bfile)))
      {
	aread=fread(ablock,BLOCK,1,afile);
	bread=fread(bblock,BLOCK,1,bfile);
	assert(aread==bread);
	assert((aread!=0)||feof(afile));
	result=memcmp(ablock,bblock,aread);
	if( result!=0 )
	  {
	    fclose(afile);
	    fclose(bfile);
	    return(result);
          }
      }
    /* decide which has higher prio by
     * 1) earlier named in command line
     * NOT: 2) shorter path length /+/ may result in alternating linkage
     * the one with lower priority will be deleted
	 */
    if(((finfo*)a)->prio<((finfo*)b)->prio)
      {
	link_files(((finfo*)a),((finfo*)b));
      } else
      {
	if(((finfo*)a)->prio>((finfo*)b)->prio)
	  {
	    link_files(((finfo*)b),((finfo*)a));
	  } else
	  {
/**/
/* it did not work well to give priority within one file system tree	*/
/*	    if(strlen(((finfo*)a)->name)<strlen(((finfo*)b)->name))	*/
/**/
	    /* We should elimate unnlinked files and
             * keep more often linked files as collectors
             */
	    if(((finfo*)a)->info.st_nlink > ((finfo*)b)->info.st_nlink )
	        link_files(((finfo*)a),((finfo*)b));
	    else
	        link_files(((finfo*)b),((finfo*)a));
	  }
      }
    fclose(afile);
    fclose(bfile);
    return(0);
}

static int comparison_by_size(const void*a, const void*b)
{
    static int sizediff;
    assert(a!=NULL);
    assert(b!=NULL);
#ifdef DEBUG
    printf("%s(%d) <> %s(%d)\n",
	(((finfo*)a)->name),(((finfo*)a)->info.st_size),
	(((finfo*)b)->name),(((finfo*)b)->info.st_size));
#endif /* DEBUG */
    sizediff = (( (((finfo*)a)->info.st_size) 
                - (((finfo*)b)->info.st_size) ));
    /* avoid work when not desired
     * follow the minbytes directive
     */
    if(minbytes>(((finfo*)a)->info.st_size))
      {
        return(sizediff);
      }
    /* up to here it is obvious:
     * - different size different file
     * - same size -> differs file?
     */
    if(sizediff == 0)
      {
#ifdef DEBUG
    printf("COMPARE: %x:%d <> %x:%d\n",
	(int)(((finfo*)a)->info.st_dev),(int)(((finfo*)a)->info.st_ino),
	(int)(((finfo*)b)->info.st_dev),(int)(((finfo*)b)->info.st_ino));
#endif /* DEBUG */
	/* softlink must not result in endless link loops
	 */
	if( (((finfo*)a)->info.st_mode & S_IFMT) != S_IFREG 
	 || (((finfo*)b)->info.st_mode & S_IFMT) != S_IFREG )
	    /* same size but no linking allowed
	     */
	    return 0;
	/* compare the path-less file names
	 * if the samename directive requires this
	 */
	if( strcmp(basename(((finfo*)a)->name),basename(((finfo*)b)->name)) 
	 && (samename==1) )
	    /* different names do not allow linking
	     */
	    return 0;
	/* check whether the user and group entry match
	 */
	if( ((((finfo*)a)->info.st_uid)==(((finfo*)b)->info.st_uid)) 
	 && ((((finfo*)a)->info.st_gid)==(((finfo*)b)->info.st_gid))
	 &&  (sameuser==1) )
	    /* different names do not allow linking
	     */
	    return 0;
	/* check whether the permissions match
	 */
	if( ((((finfo*)a)->info.st_mode&S_IRWXUGO)==(((finfo*)b)->info.st_mode&S_IRWXUGO)) 
	 && (sameperm==1) )
	    /* different names do not allow linking
	     */
	    return 0;
	/* check whether the modification dates match
	 */
	if( ((((finfo*)a)->info.st_mtime)==(((finfo*)b)->info.st_mtime)) 
	 && (sametime==1) )
	    /* different names do not allow linking
	     */
	    return 0;
	/* check whether already hardlinked
	 */
	if( (((finfo*)a)->info.st_ino != ((finfo*)b)->info.st_ino)
	 || (((finfo*)a)->info.st_dev != ((finfo*)b)->info.st_dev) )
	  {
#ifdef MD5SUM
	    if((((finfo*)a)->hash)==NULL) gethash((finfo*)a);
	    if((((finfo*)b)->hash)==NULL) gethash((finfo*)b);
	    assert(((finfo*)a)->hash != NULL);
	    assert(((finfo*)b)->hash != NULL);
	    if( strncmp( (((finfo*)a)->hash), (((finfo*)b)->hash), MATCHCNT )!=0 )
	      {
		hashreject++;
	      } else
#endif
	      {
		comparison_by_content(a,b);
	      }
          }
      }
    return(sizediff);
}

#ifdef DEBUG
void print_infolist(void)
{
  int n;
  for(n=0;n<filecount;n++)
    {
      printf("%9d %x:%05d %s\n",
	(int)file_info[n].info.st_size,
	(int)file_info[n].info.st_dev,
	(int)file_info[n].info.st_ino,
	file_info[n].name);
    }
}

/* DEBUG-only helper that releases the option string held in findopta.
 *
 * NOTE(review): this defines a function called atexit with an implicit
 * int return type and without a return statement. The name collides
 * with the standard library function atexit() declared in <stdlib.h>,
 * and nothing visible in this file calls or registers this function.
 * Presumably it was meant to be a cleanup handler with a name of its
 * own that is registered via atexit() -- confirm and rename.
 */
atexit()
{
    if(findopta!=NULL) free(findopta);
}
#endif /* DEBUG */

int
main (int argc, char *argv[])
{
  int c,i,n,localcnt,lastlinked=0;
  FILE*liste;
  char buffer[MAXPATHLENGTH];
  program_name = argv[0];
#ifdef HAVE_SETLOCALE
  setlocale (LC_ALL, "");
#endif
#ifdef DEBUG
  /* Locale is not used in current implementation
   * Time is not used in current implementation
	 */
  start_time = time (NULL);
  printf ("cur_day_start = %s", ctime (&cur_day_start));
#endif /* DEBUG */
  exit_status = 0;

  while ((c = getopt(argc, argv, "cdfhm:no:psuv")) != EOF) {
            switch (c) {
                  case 'a': sameperm   = 1-sameperm; 
                  	    sameuser   = 1-sameuser;   break;
                  case 'c': showsave   = 1-showsave;   break;
                  case 'd': sametime   = 1-sametime;   break;
                  case 'f': samename   = 1-samename;   break;
                  case 'h': usage();                 exit(0);
                  case 'm': minbytes   = atoi(optarg); break;
                  case 'n': testmode   = 1-testmode;   break;
                  case 'o': findopta   = calloc( strlen(optarg)+2, sizeof(char) );
			    strcpy( findopta, optarg );break;
                  case 'p': sameperm   = 1-sameperm;   break;
                  case 's': nosymlinks = 1-nosymlinks; break;
                  case 'u': sameuser   = 1-sameuser;   break;
                  case 'v': showmode   = 1-showmode;   break;
                  default:
			fprintf(stderr,"wrong option: \"%c\"\n",c);
                        usage();
                        exit(-1);
                  }
            }
#ifdef DEBUG
  printf("ARGC:%d\nOPTIND:%d\n",argc,optind);
#endif /* DEBUG */
	 
  /* Determine the number of files to scan. */
  for (i=optind;i < argc;i++)
    {
      sprintf(buffer,"find \"%s\" -type f %s -print | wc -l", argv[i],(findopta==NULL)?"":findopta);
      liste = popen (buffer, "r");
      fgets(buffer,MAXPATHLENGTH,liste);
      pclose(liste);
      filecount+=atoi(buffer);
    }
  printf("%d files to investigate\n\n",filecount);
  if( (argc <= optind) || (filecount < 2) )
    {
      usage();
      exit(0);
    }

  file_info = (finfo*)calloc(filecount+4,sizeof(finfo));
  assert(file_info!=NULL);
  for (i=optind,n=0;i < argc;i++)
    {
      if( argv[i][0] != '/' && (argc-optind) > 1 )
        {
	  fprintf(stderr,"WARNING: disabling symbolic links due to relative paths\n");
	  nosymlinks=1;
	}
      sprintf(buffer,"find \"%s\" -type f %s -print", argv[i],(findopta==NULL)?"":findopta);
      liste = popen (buffer, "r");
      for(;!feof(liste);localcnt--)
	{
          fgets(buffer,MAXPATHLENGTH,liste);
	  if(!feof(liste))
            {
	      file_info[n].name=(char*)malloc(strlen(buffer));
	      assert(file_info[n].name!=NULL);
	      assert(buffer[strlen(buffer)-1]=='\n');
	      buffer[strlen(buffer)-1]=0;
	      strcpy(file_info[n].name,buffer);
	      statcpy(&(file_info[n]));
	      file_info[n].prio=i-optind;
	      if(n>filecount)
	        { printf("N(%d) > Filecount(%d)\n",n,filecount);
		  file_info = realloc(file_info, sizeof(finfo)*filecount);
		}
	      n++;
            }
        }
      pclose(liste);
    }
  do{
      lastlinked=totalinked;
      qsort( file_info, filecount, sizeof(finfo), comparison_by_size );
fprintf(stderr,"sorting loop ended with %d:%d\n",lastlinked,totalinked);
    }
  while(lastlinked<totalinked && ! testmode);
  printf("%d file%s of %d %sreplaced by links.\nThe total size of replac%s files %s %lld bytes.\n",
	totalinked,(totalinked==1)?"":"s",filecount,testmode?"will be ":"",
	testmode?"able":"ed", testmode?"is":"was", linksaved);
#ifdef MD5SUM
  printf("md5 hash algorithm had to read %d files to avoid %d file comparisons.\n",hashread,hashreject);
#endif /* MD5SUM */
#ifdef DEBUG
  print_infolist();
#endif /* DEBUG */
  exit (exit_status);
}
