(* strutil.ml - Don Yang (uguu.org) 10/06/07 *) (* Get substring starting at offset *) let string_tail str offset = let strlen = String.length str in try String.sub str offset (strlen - offset) with Invalid_argument _ -> "";; (* Remove leading, trailing, and consecutive whitespaces from string. See remove_whitespace.dot for details. *) type clean_string_state = Init | WriteSpace | WriteChar;; let clean_string str = if str = "" then "" else let strlen = String.length str in let return_str = String.create strlen in let state = ref Init in let last_char = ref (-1) in let dst = ref 0 in let write_char c = (return_str.[!dst] <- c; last_char := !dst; incr dst) in let write_space () = (return_str.[!dst] <- ' '; incr dst) in (* Remove leading and duplicate whitespaces *) for src = 0 to (strlen - 1) do let c = str.[src] in match !state with Init -> ( if c = ' ' then () else (write_char c; state := WriteChar) ) | WriteChar -> ( if c = ' ' then (write_space (); state := WriteSpace) else write_char c ) | WriteSpace -> ( if c = ' ' then state := Init else (write_char c; state := WriteChar) ) done; (* Drop trailing spaces and return *) String.sub return_str 0 (!last_char + 1);; (* Get first word in string *) let first_word str skip = try ( let suffix = string_tail str skip in try String.sub suffix 0 (String.index suffix ' ') with Not_found -> suffix ) with Invalid_argument _ -> "";; (* Split space delimited string to list of tokens *) let rec split_string str = try let i = String.index str ' ' in [String.sub str 0 i] @ (split_string (string_tail str (i + 1))) with Not_found -> [str];; (* Check if a string contains the specified prefix at word boundaries *) let match_prefix_words str prefix = let prefix_length = String.length prefix in if (String.length str) = prefix_length then ( str = prefix ) else ( try (String.sub str 0 prefix_length) = prefix && (String.get str prefix_length) = ' ' with Invalid_argument _ -> false );; (* Get list of prefixes from a single string, longest prefix first *) let get_prefix_list str = let rec get_prefix_list_recurse reverse_token_list = match reverse_token_list with | a::[] -> [a] | b::a -> let sublist = get_prefix_list_recurse a in ((List.hd sublist) ^ " " ^ b) :: sublist | [] -> [] in get_prefix_list_recurse (List.rev (split_string str));; (* Process hash table keys and return a list of matching prefixes, throws exception if no prefix matches. *) let rec check_table_key key value (prefix_list, init) = if init then ( if (match_prefix_words key (List.hd prefix_list)) then (prefix_list, true) else check_table_key key value (List.tl prefix_list, true) ) else ( check_table_key key value (get_prefix_list key, true) );; (* Find longest prefix in a hash table keys *) let longest_prefix table = try let matched_prefix_list = Hashtbl.fold check_table_key table ([], false) in List.hd (fst matched_prefix_list) with Failure "tl" | Failure "hd" -> "";;