ExtLib OptParse (part 2)

Date Tags ocaml

I’ve already wrote something about OptParse last month. Today I discovered how to create a new option (that is not a string, int or bool) and validate it within the arg parser.

So suppose we want to write an application that can output both txt and html and we want the user to specify the format with command line option. One way would be to use a StdOpt.str_option - eventually with a default option - and to retrive it in application code with OptParse.Opt.get.

However this is not satisfactory as we are mixing the application code with command line parsing. A better way is to create a new type of option with Opt.value_option .

This is the concept :

type out_t = Txt | Html
module Options = struct
  open OptParse
  exception Format
  let out_option ?default ?(metavar = "<txt|html>") () =
    let corce = function
      |"txt" -> Txt
      |"html" -> Html
      | _ -> raise Format
    in
    let error _ s = Printf.sprintf "%s format not supported" s in
    Opt.value_option metavar default corce error

  let output = out_option ~default:Txt ()

  let description = "This is an example"
  let options = OptParser.make ~description:description ()

  open OptParser
  add options ~short_name:'o' ~long_name:"out" ~help:"Output type" output;
end

Note that the function Opt.value_option get a default value, a metavar - that is the sting associated with the option in the help ( -o<txt|html>, --out=<txt|html> Output type ), a corce function, that is, a function that transforms a string in the desired type, and an error function that is used by the parser to give a meaningful error is the option is not correctly validated.

For example :

$./test.native -oooo
usage: test.native [options]

test.native: option '-o': ooo format not supported

Now when we use this new option in the application code with OptParse.Opt.get and we can be certain that it was correctly validated.


subsets for reference

Date Tags ocaml

Every now and then I need to write a simple combinatorial algorithm. Using monads this is fairly easy and concise, but probably not the fastest way to do it. We start with the definition of a few functions in terms of the List module. The function themselves are kinda of self explanatory. I write this mostly for reference then for real added value.

let return a = [a]
let bind m f = List.flatten (List.map f m)
let mzero = []
let guard b = if b then return () else mzero
let mplus = List.append

let card l = (List.length l)

let rec subsets = function
  |[] -> return []
  |h :: t ->
      bind (subsets t) (fun t1 ->
        mplus (
          bind (return t1) (fun t2 -> return (h :: t2))
        ) (return t1)
      )

(* all subsets with cardinality less then k *)
(* [ x | x <- (subsets X) ; |x| <= k ] *)
let subsets_k k l =
  bind (subsets l) (fun x ->
    bind (guard (card(x) <= k)) (fun _ ->
      return x
    )
  )

(* cartesian product *)
let cartesian l1 l2 =
  bind l1 (fun x ->
    bind l2 (fun y ->
      return (x,y)
    )
  )

let rec permutation = function
  |[] -> return []
  |h::t ->
      bind (permutation t) (fun t1 ->
        List.map (fun h1 -> h1 :: t1) h
      )

The previous version of the code uses the List module. If we want a more space efficient implementation of the same functions, we can use a lazy data structure and substitute the functions in the preamble. In this case, instead of writing a lazy list module from scratch, we simply use the Enum module of ExtLib.

open ExtLib
let return a = let e = Enum.empty () in Enum.push e a ; e
let bind m f = Enum.concat (Enum.map f m)
let mzero = Enum.empty ()
let guard b = if b then return () else mzero
let mplus = Enum.append

In action :

# subsets_k 1 [1;2];;                      
- : int list list = [[2]; [1]; []]
# cartesian [1;2;3] [3;4];;
- : (int * int) list = [(1, 3); (1, 4); (2, 3); (2, 4); (3, 3); (3, 4)]
permutation [[1;2;3;4];[5;6];[7;8;9]];;
- : int list list =
[[1; 5; 7]; [2; 5; 7]; [3; 5; 7]; [4; 5; 7]; [1; 6; 7]; [2; 6; 7]; [3; 6; 7];
 [4; 6; 7]; [1; 5; 8]; [2; 5; 8]; [3; 5; 8]; [4; 5; 8]; [1; 6; 8]; [2; 6; 8];
 [3; 6; 8]; [4; 6; 8]; [1; 5; 9]; [2; 5; 9]; [3; 5; 9]; [4; 5; 9]; [1; 6; 9];
 [2; 6; 9]; [3; 6; 9]; [4; 6; 9]]

Finding maximal cliques (and independent sets) in an undirected graph. Bron–Kerbosch algorithm.

A small ocaml implementation of the Bron–Kerbosch algorithm to list all maximal cliques in an undirected graph. The description of the algorithm is on wikipedia. http://en.wikipedia.org/wiki/Bron–Kerbosch_algorithm . The example given is the same as in the wikipedia page.

open Graph

(*
The Bron–Kerbosch algorithm is an algorithm for finding maximal cliques in an undirected graph. 
http://en.wikipedia.org/wiki/Bron%E2%80%93Kerbosch_algorithm

   BronKerbosch1(R,P,X):
       if P and X are both empty:
           report R as a maximal clique
       for each vertex v in P:
           BronKerbosch1(R ⋃ {v}, P ⋂ N(v), X ⋂ N(v))
           P := P \ {v}
           X := X ⋃ {v}

   BronKerbosch2(R,P,X):
       if P and X are both empty:
           report R as a maximal clique
       choose a pivot vertex u in P ⋃ X
       for each vertex v in P \ N(u):
           BronKerbosch2(R ⋃ {v}, P ⋂ N(v), X ⋂ N(v))
           P := P \ {v}
           X := X ⋃ {v}
*)

module V = struct
  type t = int
  let compare = compare
  let hash = Hashtbl.hash
  let equal = (=)
end

module UG = Persistent.Graph.Concrete(V)
module N = Oper.Neighbourhood(UG)
module S = N.Vertex_Set

let rec bronKerbosch1 gr r p x =
  let n v = N.set_from_vertex gr v in
  if (S.is_empty p) && (S.is_empty x) then [r]
  else
    let (_,_,mxc) =
      S.fold (fun v (p,x,acc) ->
        let r' = S.union r (S.singleton v) in
        let p' = S.inter p (n v) in
        let x' = S.inter x (n v) in
        (S.remove v p, S.add v x, (bronKerbosch1 gr r' p' x') @ acc)
      ) p (p,x,[])
    in mxc

let rec bronKerbosch2 gr r p x =
  let n v = N.set_from_vertex gr v in
  if (S.is_empty p) && (S.is_empty x) then [r]
  else
    let u = S.choose (S.union p x) in
    let (_,_,mxc) =
      S.fold (fun v (p,x,acc) ->
        let r' = S.union r (S.singleton v) in
        let p' = S.inter p (n v) in
        let x' = S.inter x (n v) in
        (S.remove v p, S.add v x,(bronKerbosch2 gr r' p' x') @ acc)
      ) (S.diff p (n u)) (p,x,[])
    in mxc

let main () =
  let vl = [1;2;3;4;5;6] in
  let gr = List.fold_left (fun g v -> UG.add_vertex g v) UG.empty vl in
  let el = [(6,4);(4,3);(4,5);(5,2);(3,2);(5,1);(2,1)] in
  let gr = List.fold_left (fun g (x,y) -> UG.add_edge g x y) gr el in
  let r = S.empty in
  let p = List.fold_right S.add vl S.empty in
  let x = S.empty in
  print_endline "bronKerbosch1";
  let mxl = bronKerbosch1 gr r p x in
  List.iter (fun s ->
    Printf.printf "%s\n" (String.concat "," (List.map string_of_int (S.elements s)))
  ) mxl
  ;
  print_endline "bronKerbosch2";
  let mxl = bronKerbosch2 gr r p x in
  List.iter (fun s ->
    Printf.printf "%s\n" (String.concat "," (List.map string_of_int (S.elements s)))
  ) mxl
;;

main () ;;

To test it :

$ocamlfind ocamlc -linkpkg -package ocamlgraph cliques.ml
$./a.out 
bronKerbosch1
4,6
4,5
3,4
2,3
1,2,5
bronKerbosch2
4,6
4,5
3,4
2,3
1,2,5

There is also a bit of code here. Even if it might be more efficient, it looks to me exceptionally complicated for such a simple algorithm…

Update - Maximal Independent Sets

Maybe is also worth mentioning the Maximal independent set problem that is the dual of the maximal clique problem. In other words, the list of all maximal independent set of a graph is nothing else that the list of maximal cliques of the complement of the input graph.

module O = Oper.Make(Builder.I(UG))

let max_independent_sets gr =
  let cgr = O.complement gr in
  let r = S.empty in
  let p = UG.fold_vertex S.add cgr S.empty in
  let x = S.empty in
  bronKerbosch2 cgr r p x

ExtLib OptParse module. Options parsing made easy and clean

Date Tags ocaml

I recently discovered the extLib OptPase module [1] . It’s a very nice and complete replacement for the good old Arg in the standard library. I’m gonna give a small example on how to use it. I hope this can be useful to somebody.

I first build an Option module to clearly separate the options handling from the rest of my program. To keep it short we add only two options, debug and output. Debug has a short and long option, output is only a string. We also add two group options to spice up the example …

open ExtLib

module Options =
struct
    open OptParse
    let debug = StdOpt.store_true ()
    let out = StdOpt.str_option ()

    let options = OptParser.make ()

    open OptParser

    let g = add_group options ~description:"general options" "general" ;;
    let o = add_group options ~description:"output options" "output" ;;

    add options ~group:g ~short_name:'d' ~long_name:"debug" ~help:"Debug information" debug;
    add options ~group:o ~long_name:"out" ~help:"Send output to a file" out;
end

To actually parse the options we have a main function that invokes the parse_argv function, stores all the options in the respective variables in the module Options and return a list of string containing all the positional arguments given on the command line that are not parsed as options.

let main () =
  let posargs = OptParse.OptParser.parse_argv Options.options in

  if OptParse.Opt.get Options.debug then
     Printf.eprintf "enabling debug\n"
  ;

  (* dump all positional arguments *)
  let ch =
    if OptParse.Opt.is_set Options.out then
      open_out (OptParse.Opt.get Options.out)
    else stdout
  in

  List.iter (Printf.fprintf ch "%s\n") posargs
;;
main ()
#./test.native --help
usage: test.native [options]

options:

  -h, --help            show this help message and exit

  general:

    general options

    -d, --debug         Debug information

  output:

    output options

    --out=STR           Send output to a file

#./test.native -d one two three
one
two
three
enabling Debug
#

[1] http://ocaml-extlib.googlecode.com/svn/doc/apiref/OptParse.html


simple expat based xml parser

Date Tags ocaml

The other day I needed a small xml parser to convert an xml document into a different format. First I tried xml-light. This is a simple parser all written in ocaml that stores the parser xml document in an ocaml data structure. This data structure can be user to access various fields of the xml document. It does not offer a dom-like interface, but actually I consider this a feature. Unfortunately xml-light is terribly slow. To parse 30K-plus lines of xml it takes far too long to be considered for my application.

The next logic choice was to try Expat, that is a event-based parser and it is extremely fast. Since using an event based parser can be a bit cumbersome (and I already had written of bit of code using xml-light), I decided to write a small wrapper around expat to provide a xml-light interface to it.

The code is pretty simple and the main idea is taken from the cduce xml loader.

First we provide a small data structure to hold the xml document as we examine it. Nothing deep here. Notice that we use Start and String as we descend the tree and Element we we unwind the stack.

type stack =
  | Element of (Xml.xml * stack)
  | Start of (string * (string * string) list * stack)
  | String of (string * t * stack)
  | Empty
and t =
  | PCData
  | CData

Then we need to provide expat handlers to store xml fragments on the stack as we go down. Note that we have an handler for cdata, but not an handler for pcdata as it is the default.

let pcdata buff = (Xml.PCData buff)
let cdata buff = (Xml.CData buff)

let rec create_elt acc = function
  | String (s,CData, st) -> create_elt (L.push (cdata s) acc) st
  | String (s,PCData, st) -> create_elt (L.push (pcdata s) acc) st
  | Element (x,st) -> create_elt (L.push x acc) st
  | Start (tag,attrlst,st) -> stack := Element(Xml.Element(tag,attrlst,acc),st)
  | Empty -> assert false

let start_cdata_handler () = txt.cdata <- true ;;

let start_element_handler tag attrlst =
  if not (only_ws txt.buffer txt.pos) then begin
    let str = String.sub txt.buffer 0 txt.pos in
    if txt.cdata then
      stack := String (str, CData, !stack)
    else
      stack := String (str, PCData, !stack)
  end
  ;
  txt.pos <- 0;
  txt.cdata <- false;
  stack := Start (tag,attrlst,!stack)
;;

let end_element_handler _ =
  let acc =
    if only_ws txt.buffer txt.pos then L.empty
    else
      let str = String.sub txt.buffer 0 txt.pos in
      if txt.cdata then L.one (cdata str)
      else L.one (pcdata str)
  in
  txt.pos <- 0;
  txt.cdata <- false;
  create_elt acc !stack
;;

let character_data_handler = add_string txt ;;

At the end we just register all handlers with the expat parser and we return the root of the xml document.

let parse_string str =
  let p = Expat.parser_create None in
  Expat.set_start_element_handler p start_element_handler ;
  Expat.set_end_element_handler p end_element_handler ;
  Expat.set_start_cdata_handler p start_cdata_handler ;
  Expat.set_character_data_handler p character_data_handler ;
  ignore (Expat.set_param_entity_parsing p Expat.ALWAYS);
  Expat.parse p str;
  Expat.final p;
  match !stack with
  |Element (x,Empty) -> (stack := Empty; x)
  | _ -> assert false

I’ve copied the xml-light methods and to access the document in a different file. I’ve also made everything lazy to save a bit of computing time if it is only necessary to access a part of a huge xml document.

The complete code can be found here: git clone https://www.mancoosi.org/~abate/repos/xmlparser.git

UPDATE

The other that I was made aware that this parser has a serious bug when used on a 32 bit machine. The problem is that the maximal string size on a 32bit machine is equal to Sys.max_string_length that is roughly 16Mb . If we read and parse a bit document document at once with IO.read_all , we immediately get an exception. The solution is to parse the document incrementally using the new function parser_ch below that get a channel instead of a string and run the expat parser incrementally :

let parser_aux f =
  let p = Expat.parser_create None in
  Expat.set_start_element_handler p start_element_handler ;
  Expat.set_end_element_handler p end_element_handler ;

  Expat.set_start_cdata_handler p start_cdata_handler ;

  Expat.set_character_data_handler p character_data_handler ;
  ignore (Expat.set_param_entity_parsing p Expat.ALWAYS);
  f p;
  match !stack with
  |Element (x,Empty) -> (stack := Empty; x)
  | _ -> assert false

let parse_str str =
  let f p =
    Expat.parse p str;
    Expat.final p;
  in
  parser_aux f

let parse_ch ch =
  let f p =
    try while true do
      Expat.parse p (IO.nread ch 10240)
    done with IO.No_more_input -> Expat.final p ;
  in
  parser_aux f