(* natsume.ml - Don Yang (uguu.org)

   Ported from natsume0.pl, go there for more comments.

   Because stat is not in the standard libraries, unix.cma or unix.cmxa
   is needed for the program to work:

      ocaml unix.cma natsume.ml
      ocamlc unix.cma natsume.ml -o natsume
      ocamlopt unix.cxma natsume.ml -o natsume

   Doesn't work for files larger than 2GB.

   12/26/05
                                                                           *)

(*----------------------------------------------- Path management routines *)

(* Directory component separator *)
let path_separator = '/';;
let path_separator_str = String.make 1 path_separator;;

(* Empty file *)
let empty_file = "/dev/null";;

(* Split string to list of components.  String.index could have been
   used instead, but the library implementation is very similar to
   what's here with unnecessary exception bits.                      *)
let rec split_path_recurse path index limit =
   if index >= limit - 1
   then [path]
   else
      if String.get path index = path_separator
      then
         let remaining = limit - index - 1 in
            (String.sub path 0 index) ::
            (split_path_recurse
               (String.sub path (index + 1) remaining) 0 remaining)
      else split_path_recurse path (index + 1) limit;;

let split_path path =
   split_path_recurse path 0 (String.length path);;

(* Join list of path components to a single string *)
let rec join_path_recurse part_list =
   match part_list
   with a::[] -> a
   | a::b -> a ^ path_separator_str ^ (join_path_recurse b)
   | [] -> "";;

let join_path part_list =
   join_path_recurse part_list;;

(* Canonicalize a single path string, removing . and .. components.
   List of path components to keep is maintained as a reversed list
   for better memory management.                                    *)
let rec canonicalize_recurse rev_keep_parts path_parts =
   match path_parts
   with "."::b -> (canonicalize_recurse rev_keep_parts b)
   | ".."::b ->
   (
      match rev_keep_parts
      with ""::y | ".."::y -> (canonicalize_recurse (".."::rev_keep_parts) b)
      | x::y -> canonicalize_recurse y b
      | [] -> (canonicalize_recurse [".."] b)
   )
   | a::b -> (canonicalize_recurse (a::rev_keep_parts) b)
   | [] -> List.rev rev_keep_parts;;

let canonicalize path =
   join_path (canonicalize_recurse [] (split_path path));;

(* Remove duplicate items in sorted list *)
let rec uniq_recurse last rest =
   match rest
   with a::b ->
      if a = last
      then uniq_recurse last b
      else last::(uniq_recurse a b)
   | [] -> [last];;

let uniq x =
   match x with a::b -> uniq_recurse a b
   | _ -> x;;

(* Remove empty items in list *)
let rec remove_empty_recurse string_list =
   match string_list
   with ""::b | "."::b | ".."::b -> remove_empty_recurse b
   | a::b -> a::(remove_empty_recurse b)
   | [] -> [];;

let remove_empty string_list =
   remove_empty_recurse string_list;;

(* Compute relative path from source to target *)
let rec prepend_relative_root path_parts depth =
   if depth = 0
   then path_parts
   else ".." :: (prepend_relative_root path_parts (depth - 1));;

let rec common_root source_parts target_parts =
   if List.hd source_parts = List.hd target_parts
   then common_root (List.tl source_parts) (List.tl target_parts)
   else prepend_relative_root target_parts ((List.length source_parts) - 1);;

let relative_path source target =
   let target_parts = split_path target in
   match target_parts
   with ""::b -> target
   | [] -> target
   | _ ->
      let source_parts = split_path source in
      match source_parts
      with ""::b -> target
      | [] -> target
      | _ ->
         join_path (common_root source_parts target_parts);;

(*-------------------------------------------------------- Collision tests *)

(* Number of header bytes to read *)
let header_length = 1024;;

(* Key type for header and content hash tables *)
type hash_key_type = Hash of Digest.t | NoHash;;

(* Get file size, return -1 if stat failed. *)
let file_size file =
   try (Unix.stat file).Unix.st_size
   with _ -> -1;;

(* Get MD5 of first part of file.  Throws exceptions. *)
let digest_header read_count file =
   let infile = open_in_bin file in
   let size = file_size file in
   let read_size =
      (if size > header_length then header_length else size) in
   let data = String.make read_size '.' in
   really_input infile data 0 read_size;
   let d = Digest.string (String.sub data 0 read_size) in
   close_in infile;
   read_count := Int64.add !read_count (Int64.of_int read_size);
   Hash d;;

(* Get MD5 for the whole file.  Throws exceptions. *)
let digest_all read_count file =
   read_count := Int64.add !read_count (Int64.of_int (file_size file));
   Hash (Digest.file file);;

(* Return hash table with a single value and unitialized key *)
let stub_content_hash file =
   let h = Hashtbl.create 1 in
   Hashtbl.add h NoHash file; h;;

let stub_header_hash file =
   let h = Hashtbl.create 1 in
   Hashtbl.add h NoHash (stub_content_hash file); h;;

(* Force lazy evaluation of hash keys: If a hash table has exactly one
   entry, it means the entry was previously added with only one value
   but no key computed (because we only compute the keys after seeing
   at least 2 entries).  This function replace the single hash table
   entry with the computed key.  Throws exceptions.                    *)
let evaluate_lazy_content_key content_hash read_count =
   if Hashtbl.length content_hash = 1 then
   try
      let file_name = Hashtbl.find content_hash NoHash in
      Hashtbl.remove content_hash NoHash;
      Hashtbl.add content_hash (digest_all read_count file_name) file_name
   with Not_found -> ();;

let evaluate_lazy_header_key header_hash read_count =
   if Hashtbl.length header_hash = 1 then
   try
      let content_hash = Hashtbl.find header_hash NoHash in
      let file_name = Hashtbl.find content_hash NoHash in
      Hashtbl.remove header_hash NoHash;
      Hashtbl.add header_hash
         (digest_header read_count file_name) (stub_content_hash file_name)
   with Not_found -> ();;

(* Check for content collision.  Return name of other file if found,
   otherwise update content_hash and return empty string.            *)
let check_content_collision content_hash read_count file =
   if Hashtbl.length content_hash = 0
   then (Hashtbl.add content_hash NoHash file; "")
   else
   (
      evaluate_lazy_content_key content_hash read_count;
      let d = digest_all read_count file in
      try Hashtbl.find content_hash d
      with Not_found -> Hashtbl.add content_hash d file; ""
   );;

(* Check for header->content collision.  Return name of other file if
   found, otherwise update header_hash and return empty string.       *)
let check_header_collision header_hash read_count file =
   if Hashtbl.length header_hash = 0
   then (Hashtbl.add header_hash NoHash (stub_content_hash file); "")
   else
   (
      evaluate_lazy_header_key header_hash read_count;
      let d = digest_header read_count file in
      try
         let c = Hashtbl.find header_hash d in
         check_content_collision c read_count file
      with Not_found ->
         Hashtbl.add header_hash d (stub_content_hash file); ""
   );;

(* Check for size->header->content collision, return name if found,
   otherwise update size_hash and return empty string.              *)
let check_size_collision size_hash read_count file size =
   try
      let h = Hashtbl.find size_hash size in
      check_header_collision h read_count file
   with Not_found ->
      (Hashtbl.add size_hash size (stub_header_hash file); "")
   | Sys_error msg ->
      print_string "# ";
      print_string msg;
      print_newline();
      "";;

(*--------------------------------------------------------- Input routines *)

(* Preprocess file list for duplicates and bad path names *)
let preprocess_list file_list =
   remove_empty (uniq
      (List.sort String.compare (List.rev_map canonicalize file_list)));;

(* Return all lines in stdin as a list *)
type line = String of string | Eof;;

let rec stdin_lines () =
   let x =
      try String (read_line())
      with End_of_file -> Eof in
   match x
   with String line -> line :: (stdin_lines())
   | Eof -> [];;

(* Return sorted list of files to process: use argv if there is at
   least one argument excluding command name, otherwise use lines from
   stdin.                                                              *)
let file_list () =
   preprocess_list
   (
      let len = Array.length Sys.argv in
         if len > 1
         then Array.to_list (Array.sub Sys.argv 1 (len - 1))
         else stdin_lines ()
   );;

(*------------------------------------------------------------------- main *)

(* Print link from one orig to output *)
let print_link orig output =
   print_string "ln -s -f '";
   print_string (relative_path output orig);
   print_string "' '";
   print_string output;
   print_string "'\n";;

(* Check a file for collision, print link if found *)
let process_file hash file_count dup_count
                 dup_bytes read_count total_count file =
   let size = file_size file in
   match size
   with -1 ->
   (
      print_string "# ";
      print_string file;
      print_string ": can not get file size\n"
   )
   | 0 ->
   (
      print_string "ln -s -f ";
      print_string empty_file;
      print_string " '";
      print_string file;
      print_string "'\n";
      incr file_count
   )
   | _ ->
      let orig = check_size_collision hash read_count file size in
      if orig <> ""
      then
      (
         incr dup_count;
         dup_bytes := Int64.add !dup_bytes (Int64.of_int size);
         print_link orig file
      );
      incr file_count;
      total_count := Int64.add !total_count (Int64.of_int size);;

(* Process all files *)
let all_files = file_list();;
let collision_table = Hashtbl.create (List.length all_files);;
let file_count = ref 0;;
let dup_count = ref 0;;
let dup_bytes = ref Int64.zero;;
let read_count = ref Int64.zero;;
let total_count = ref Int64.zero;;

List.iter
(
   fun x -> process_file
      collision_table
      file_count
      dup_count
      dup_bytes
      read_count
      total_count
      x
) all_files;;

(* Print stats *)
print_string "# ";;
print_int !file_count;;
print_string " files, ";;
print_string (Int64.to_string !read_count);;
print_char '/';;
print_string (Int64.to_string !total_count);;
print_string " bytes read\n";;
if !dup_count > 0
then
(
   print_string "# ";
   print_string (Int64.to_string !dup_bytes);
   print_string " bytes in ";
   print_int !dup_count;
   print_string " duplicate files\n"
)
else
(
   print_string "# No duplicates found\n"
);;