(* natsume.ml - Don Yang (uguu.org) Ported from natsume0.pl, go there for more comments. Because stat is not in the standard libraries, unix.cma or unix.cmxa is needed for the program to work: ocaml unix.cma natsume.ml ocamlc unix.cma natsume.ml -o natsume ocamlopt unix.cxma natsume.ml -o natsume Doesn't work for files larger than 2GB. 12/26/05 *) (*----------------------------------------------- Path management routines *) (* Directory component separator *) let path_separator = '/';; let path_separator_str = String.make 1 path_separator;; (* Empty file *) let empty_file = "/dev/null";; (* Split string to list of components. String.index could have been used instead, but the library implementation is very similar to what's here with unnecessary exception bits. *) let rec split_path_recurse path index limit = if index >= limit - 1 then [path] else if String.get path index = path_separator then let remaining = limit - index - 1 in (String.sub path 0 index) :: (split_path_recurse (String.sub path (index + 1) remaining) 0 remaining) else split_path_recurse path (index + 1) limit;; let split_path path = split_path_recurse path 0 (String.length path);; (* Join list of path components to a single string *) let rec join_path_recurse part_list = match part_list with a::[] -> a | a::b -> a ^ path_separator_str ^ (join_path_recurse b) | [] -> "";; let join_path part_list = join_path_recurse part_list;; (* Canonicalize a single path string, removing . and .. components. List of path components to keep is maintained as a reversed list for better memory management. *) let rec canonicalize_recurse rev_keep_parts path_parts = match path_parts with "."::b -> (canonicalize_recurse rev_keep_parts b) | ".."::b -> ( match rev_keep_parts with ""::y | ".."::y -> (canonicalize_recurse (".."::rev_keep_parts) b) | x::y -> canonicalize_recurse y b | [] -> (canonicalize_recurse [".."] b) ) | a::b -> (canonicalize_recurse (a::rev_keep_parts) b) | [] -> List.rev rev_keep_parts;; let canonicalize path = join_path (canonicalize_recurse [] (split_path path));; (* Remove duplicate items in sorted list *) let rec uniq_recurse last rest = match rest with a::b -> if a = last then uniq_recurse last b else last::(uniq_recurse a b) | [] -> [last];; let uniq x = match x with a::b -> uniq_recurse a b | _ -> x;; (* Remove empty items in list *) let rec remove_empty_recurse string_list = match string_list with ""::b | "."::b | ".."::b -> remove_empty_recurse b | a::b -> a::(remove_empty_recurse b) | [] -> [];; let remove_empty string_list = remove_empty_recurse string_list;; (* Compute relative path from source to target *) let rec prepend_relative_root path_parts depth = if depth = 0 then path_parts else ".." :: (prepend_relative_root path_parts (depth - 1));; let rec common_root source_parts target_parts = if List.hd source_parts = List.hd target_parts then common_root (List.tl source_parts) (List.tl target_parts) else prepend_relative_root target_parts ((List.length source_parts) - 1);; let relative_path source target = let target_parts = split_path target in match target_parts with ""::b -> target | [] -> target | _ -> let source_parts = split_path source in match source_parts with ""::b -> target | [] -> target | _ -> join_path (common_root source_parts target_parts);; (*-------------------------------------------------------- Collision tests *) (* Number of header bytes to read *) let header_length = 1024;; (* Key type for header and content hash tables *) type hash_key_type = Hash of Digest.t | NoHash;; (* Get file size, return -1 if stat failed. *) let file_size file = try (Unix.stat file).Unix.st_size with _ -> -1;; (* Get MD5 of first part of file. Throws exceptions. *) let digest_header read_count file = let infile = open_in_bin file in let size = file_size file in let read_size = (if size > header_length then header_length else size) in let data = String.make read_size '.' in really_input infile data 0 read_size; let d = Digest.string (String.sub data 0 read_size) in close_in infile; read_count := Int64.add !read_count (Int64.of_int read_size); Hash d;; (* Get MD5 for the whole file. Throws exceptions. *) let digest_all read_count file = read_count := Int64.add !read_count (Int64.of_int (file_size file)); Hash (Digest.file file);; (* Return hash table with a single value and unitialized key *) let stub_content_hash file = let h = Hashtbl.create 1 in Hashtbl.add h NoHash file; h;; let stub_header_hash file = let h = Hashtbl.create 1 in Hashtbl.add h NoHash (stub_content_hash file); h;; (* Force lazy evaluation of hash keys: If a hash table has exactly one entry, it means the entry was previously added with only one value but no key computed (because we only compute the keys after seeing at least 2 entries). This function replace the single hash table entry with the computed key. Throws exceptions. *) let evaluate_lazy_content_key content_hash read_count = if Hashtbl.length content_hash = 1 then try let file_name = Hashtbl.find content_hash NoHash in Hashtbl.remove content_hash NoHash; Hashtbl.add content_hash (digest_all read_count file_name) file_name with Not_found -> ();; let evaluate_lazy_header_key header_hash read_count = if Hashtbl.length header_hash = 1 then try let content_hash = Hashtbl.find header_hash NoHash in let file_name = Hashtbl.find content_hash NoHash in Hashtbl.remove header_hash NoHash; Hashtbl.add header_hash (digest_header read_count file_name) (stub_content_hash file_name) with Not_found -> ();; (* Check for content collision. Return name of other file if found, otherwise update content_hash and return empty string. *) let check_content_collision content_hash read_count file = if Hashtbl.length content_hash = 0 then (Hashtbl.add content_hash NoHash file; "") else ( evaluate_lazy_content_key content_hash read_count; let d = digest_all read_count file in try Hashtbl.find content_hash d with Not_found -> Hashtbl.add content_hash d file; "" );; (* Check for header->content collision. Return name of other file if found, otherwise update header_hash and return empty string. *) let check_header_collision header_hash read_count file = if Hashtbl.length header_hash = 0 then (Hashtbl.add header_hash NoHash (stub_content_hash file); "") else ( evaluate_lazy_header_key header_hash read_count; let d = digest_header read_count file in try let c = Hashtbl.find header_hash d in check_content_collision c read_count file with Not_found -> Hashtbl.add header_hash d (stub_content_hash file); "" );; (* Check for size->header->content collision, return name if found, otherwise update size_hash and return empty string. *) let check_size_collision size_hash read_count file size = try let h = Hashtbl.find size_hash size in check_header_collision h read_count file with Not_found -> (Hashtbl.add size_hash size (stub_header_hash file); "") | Sys_error msg -> print_string "# "; print_string msg; print_newline(); "";; (*--------------------------------------------------------- Input routines *) (* Preprocess file list for duplicates and bad path names *) let preprocess_list file_list = remove_empty (uniq (List.sort String.compare (List.rev_map canonicalize file_list)));; (* Return all lines in stdin as a list *) type line = String of string | Eof;; let rec stdin_lines () = let x = try String (read_line()) with End_of_file -> Eof in match x with String line -> line :: (stdin_lines()) | Eof -> [];; (* Return sorted list of files to process: use argv if there is at least one argument excluding command name, otherwise use lines from stdin. *) let file_list () = preprocess_list ( let len = Array.length Sys.argv in if len > 1 then Array.to_list (Array.sub Sys.argv 1 (len - 1)) else stdin_lines () );; (*------------------------------------------------------------------- main *) (* Print link from one orig to output *) let print_link orig output = print_string "ln -s -f '"; print_string (relative_path output orig); print_string "' '"; print_string output; print_string "'\n";; (* Check a file for collision, print link if found *) let process_file hash file_count dup_count dup_bytes read_count total_count file = let size = file_size file in match size with -1 -> ( print_string "# "; print_string file; print_string ": can not get file size\n" ) | 0 -> ( print_string "ln -s -f "; print_string empty_file; print_string " '"; print_string file; print_string "'\n"; incr file_count ) | _ -> let orig = check_size_collision hash read_count file size in if orig <> "" then ( incr dup_count; dup_bytes := Int64.add !dup_bytes (Int64.of_int size); print_link orig file ); incr file_count; total_count := Int64.add !total_count (Int64.of_int size);; (* Process all files *) let all_files = file_list();; let collision_table = Hashtbl.create (List.length all_files);; let file_count = ref 0;; let dup_count = ref 0;; let dup_bytes = ref Int64.zero;; let read_count = ref Int64.zero;; let total_count = ref Int64.zero;; List.iter ( fun x -> process_file collision_table file_count dup_count dup_bytes read_count total_count x ) all_files;; (* Print stats *) print_string "# ";; print_int !file_count;; print_string " files, ";; print_string (Int64.to_string !read_count);; print_char '/';; print_string (Int64.to_string !total_count);; print_string " bytes read\n";; if !dup_count > 0 then ( print_string "# "; print_string (Int64.to_string !dup_bytes); print_string " bytes in "; print_int !dup_count; print_string " duplicate files\n" ) else ( print_string "# No duplicates found\n" );;