list unique

Well ... it seems that planet ocaml is faster then light to index new pages... I often start editing one story to publish it few days later to avoid stupid mistakes. This time I published the page by accident and I think it stayed published for less then two minutes... This is the final versions ...

Today i did a small audit on my code to check which are the functions that are often used and can slow down my code. One in particular took my attention, ExtLib.List.unique . This function (below) takes quadratic time on the length of the input list. On big lists, this function is a killer. The algorithm is very simple. Since it accepts a cmp optional argument, it can be much faster if we use a monomorphic comparing function.

let rec unique ?(cmp = ( = )) l =
        let rec loop dst = function
                | [] -> ()
                | h :: t ->
                        match exists (cmp h) t with
                        | true -> loop dst t
                        | false ->
                                let r = { hd =  h; tl = [] }  in
                                dst.tl <- inj r;
                                loop r t
        in
        let dummy = dummy_node() in
        loop dummy l;
        dummy.tl

This is a small benchmark (benchmarking is addictive indeed !!) using, the polymorphic and monomorphic variant of List.unique :

open ExtLib ;;

Random.self_init ();;

let list_unique l = List.unique l ;;

let list_unique_mono l =
  let cmp (x : string) (y : string) = x = y in
  List.unique ~cmp l
;;

let run () =
  let rec gen_strings acc = function
    |0 -> acc
    |n -> gen_strings ((string_of_int(Random.int 100))::acc) (n-1)
  in
  let a = gen_strings [] 100000 in
  Benchmark.latencyN (Int64.of_int 10) [
    ("list_unique",list_unique,a);
    ("list_unique_mono",list_unique_mono,a);
    ("hash_unique",hash_unique,a);
  ]
;;
run ();;

However if you want to go even faster, you can use a stupid implementation based on hash tables.

let hash_unique l =
  let h = Hashtbl.create (List.length l) in
  let add n =
    if not(Hashtbl.mem h n) then
      Hashtbl.add h n ()
  in
  List.iter add l;
  Hashtbl.fold (fun k _ acc -> k::acc) h []

The results are quite clear...

Latencies for 10 iterations of "list_unique", "list_unique_mono", "hash_unique":
     list_unique:  5.34 WALL ( 5.24 usr +  0.06 sys =  5.30 CPU) @  1.89/s (n=10)
     list_unique_mono:  1.41 WALL ( 1.31 usr +  0.08 sys =  1.39 CPU) @  7.18/s (n=10)
     hash_unique:  0.21 WALL ( 0.18 usr +  0.03 sys =  0.21 CPU) @ 48.07/s (n=10)

In this test with a long list and many repetition, the difference is remarkable. the function hash_unique is not stable, but if you don't care, it does the job pretty well. If you want an even faster implementation based on list that is also not stable, you can write a small function that remove duplicates on a sorted list.

Average: 1.3 (9 votes)

Comments

BatList.sort_unique

is the unstable version in Batteries.

15 % improvement

Here is a slightly improved version (hash_unique2) that goes through the list only once:

let hash_unique l =
  let h = Hashtbl.create (List.length l) in
  let add n =
    if not(Hashtbl.mem h n) then
      Hashtbl.add h n ()
  in
  List.iter add l;
  Hashtbl.fold (fun k _ acc -> k::acc) h []

let hash_unique2 l =
  let h = Hashtbl.create (List.length l) in
  let rec add acc =
    function
      | hd :: tl when not (Hashtbl.mem h hd) ->
          Hashtbl.add h hd ();
          add (hd :: acc) tl
      | _ :: tl ->
          add acc tl
      | [] ->
          acc
  in
    add [] l

module SetString = Set.Make(String)

let set_unique l =
  let rec add st acc =
    function
      | hd :: tl when not (SetString.mem hd st) ->
          add (SetString.add hd st) (hd :: acc) tl
      | _ :: tl ->
          add st acc tl
      | [] ->
          acc
  in
    add SetString.empty [] l

let run () =
  let rec gen_strings acc = function
    |0 -> acc
    |n -> gen_strings ((string_of_int(Random.int 100))::acc) (n-1)
  in
  let a = gen_strings [] 100000
  in
  let _ =
    Benchmark.latencyN (Int64.of_int 100) [
      ("hash_unique",hash_unique,a);
      ("hash_unique2",hash_unique2,a);
      ("set_unique", set_unique,a);
    ]
  in
  let lst1 = hash_unique a in
  let lst2 = hash_unique2 a in
    Printf.eprintf "l1: %d; l2: %d;\n%!" (List.length lst1) (List.length lst2);
    assert(List.length lst1 = List.length lst2)
;;
run ();;

And the result:

ocamlfind ocamlopt -package benchmark -linkpkg -o test test.ml
./test
Latencies for 100 iterations of "hash_unique", "hash_unique2", "set_unique":
 hash_unique:  1.05 WALL ( 1.04 usr +  0.01 sys =  1.05 CPU) @ 95.41/s (n=100)
hash_unique2:  0.93 WALL ( 0.92 usr +  0.00 sys =  0.92 CPU) @ 108.22/s (n=100)
              (warning: too few iterations for a reliable count)
  set_unique:  2.72 WALL ( 2.72 usr +  0.00 sys =  2.72 CPU) @ 36.71/s (n=100)
l1: 100; l2: 100;

even faster ...

If you specialize the hashtbl...

 
module OCAMLHashtbl = Hashtbl

open ExtLib ;;

[...]

let hash_unique2_mono l =
  let module StringHash = OCAMLHashtbl.Make(
    struct
      type t = string
      let equal (x : string) (y : string) = x = y
      let hash t = Hashtbl.hash t
    end)
  in
  let h = StringHash.create (List.length l) in
  let rec add acc =
    function
      | hd :: tl when not (StringHash.mem h hd) ->
          StringHash.add h hd ();
          add (hd :: acc) tl
      | _ :: tl ->
          add acc tl
      | [] ->
          acc
  in
  add [] l