I have a weird string syntax where the meaning of a
delimiter depends on context. In the following sample
input:
( (foo) (bar) )
the result is a list of two strings ["foo"; "bar"].
The outer pair of parentheses enters list mode.
Then, the next pair of parentheses delimits the string.
Inside strings, balanced pairs of parentheses are to be
treated as part of the string.
Right now the lexer decides what to return depending
on a global variable, `inside`.
{
(* Token constructors (Enter, Leave, String) come from the generated parser. *)
open Sample_parser
(* Lexing error carrying a human-readable message. *)
exception Error of string
(* Global mode flag: false at top level, true after the outer opening
   parenthesis has been seen. *)
let inside = ref false (* <= to be eliminated *)
}
The delimiters are parentheses. If the lexer hits an
opening parenthesis, then
- if `inside` is false, it emits an `Enter` token and
`inside` is set to true.
- If `inside` is true, it switches to a string lexer
which treats any properly nested pair of parentheses
as part of the string. If the nesting level returns to
zero, the string buffer is passed to the parser.
If a closing parenthesis is encountered outside a string,
a `Leave` token is emitted and `inside` is unset.
My question is: How do I rewrite the lexer without
the global variable `inside`?
Fwiw I use menhir but afaict the same would be true for
ocamlyacc.
(Sorry if this sounds confused, I’m really a newbie to
the yacc/lex approach.
I can express all the above without thinking as a PEG but I
haven’t got used to mentally keeping lexer and parser
separated.
Feel free to point out other issues with the code!)
Simple example: *sample_lexer.mll*
{
(* Token constructors (Enter, Leave, String) come from the generated parser. *)
open Sample_parser
(* Lexing error carrying a human-readable message. *)
exception Error of string
(* Global mode flag: false at top level, true after the outer opening
   parenthesis has been seen; reset to false by a closing parenthesis
   outside a string. *)
let inside = ref false (* <= to be eliminated *)
}
(* Opening delimiter; its meaning depends on the lexer mode. *)
let lpar = "("
(* Closing delimiter. *)
let rpar = ")"
(* Whitespace characters skipped between tokens. *)
let ws = [' ' '\t' '\n' '\r']
(* [tokenize lexbuf] returns the next token for the parser.

   - Whitespace is skipped.
   - An opening parenthesis yields [Enter] when [inside] is false (and sets
     [inside]); otherwise it starts a string, which is scanned by
     [string_scanner] and returned as [String s].
   - A closing parenthesis outside a string yields [Leave] and clears
     [inside].

   Raises [Error] on end of input or on any other character outside a
   string. Without these two rules ocamllex would fail with the generic
   [Failure "lexing: empty token"], which callers handling [Error] would
   not catch. *)
rule tokenize = parse
  | ws   { tokenize lexbuf }
  | lpar { if not !inside then begin
             inside := true;
             Enter
           end else begin
             let buf = Buffer.create 20 in
             (* Remember where the string started for error reporting. *)
             String (string_scanner
                       (Lexing.lexeme_start lexbuf)
                       0
                       buf
                       lexbuf)
           end }
  | rpar { inside := false; Leave }
  | eof  { raise (Error (Printf.sprintf
             "Unexpected end of file at pos %d!\n"
             (Lexing.lexeme_start lexbuf))) }
  | _ as chr { raise (Error (Printf.sprintf
                 "Unexpected character %C at pos %d!\n"
                 chr
                 (Lexing.lexeme_start lexbuf))) }
(* [string_scanner init depth buf lexbuf] accumulates the characters of a
   string into [buf] and returns its contents once the closing parenthesis
   at nesting level 0 is reached.

   - [init]  : offset at which the string started (used only in the error
               message);
   - [depth] : current nesting level of parentheses inside the string;
   - [buf]   : accumulator for the string contents.

   Nested, balanced parentheses are kept as part of the string.
   Raises [Error] if the input ends before the string is closed. *)
and string_scanner init depth buf = parse
  | rpar { if depth = 0 then
             (* Closing delimiter of the string itself: not part of it. *)
             Buffer.contents buf
           else begin
             Buffer.add_char buf ')';
             string_scanner init (depth - 1) buf lexbuf
           end }
  | lpar { Buffer.add_char buf '(';
           string_scanner init (depth + 1) buf lexbuf }
  | eof  { raise (Error (Printf.sprintf
             "Unexpected end of file inside string, [pos %d--%d]!\n"
             init
             (Lexing.lexeme_start lexbuf))) }
  | _ as chr { Buffer.add_char buf chr;
               string_scanner init depth buf lexbuf }
*sample_parser.mly*:
(* A complete string; carries the text between its delimiting parentheses. *)
%token <string> String
(* Outer opening parenthesis: start of the list. *)
%token Enter
(* Closing parenthesis outside a string: end of the list. *)
%token Leave
(* Entry point: yields the list of strings. *)
%start <string list> process
%%
(* The whole input: a non-empty list of strings wrapped in Enter ... Leave. *)
process:
  | strings = delimited(Enter, string_list, Leave) { strings }
(* One or more elements, collected in source order. *)
string_list:
  | elements = nonempty_list(element) { elements }
(* A list element is a single String token; yields its payload. *)
element:
  | s = String { s }
main.ml:
open Batteries
(* Demo input: an outer list holding two strings, the first of which
   contains a nested pair of parentheses. The comment below maps each
   input character to the role the lexer should assign to it. *)
let sample_input = "( (foo (bar) baz) (xyzzy) )"
(* EibssssssssssssseibssssseiL
* where E := enter inner
* L := leave inner
* i := ignore (whitespace)
* b := begin string
* e := end string
* s := part of string
*
* desired result: [ "foo (bar) baz"; "xyzzy" ] (type string list)
*)
(* Parses [sample_input] and prints the resulting string list on stdout,
   followed by a newline. Lexer and parser errors are reported on stderr
   instead of propagating. *)
let main () =
  let lexbuf = Lexing.from_string sample_input in
  try
    let strings = Sample_parser.process Sample_lexer.tokenize lexbuf in
    List.print String.print stdout strings;
    print_string "\n"
  with
  | Sample_lexer.Error msg ->
    (* The lexer's message already ends with a newline. *)
    Printf.eprintf "%s%!" msg
  | Sample_parser.Error ->
    Printf.eprintf "Invalid syntax at pos %d.\n%!" (Lexing.lexeme_start lexbuf)
(* Program entry point. [let () =] asserts that [main ()] returns unit, so an
   accidental partial application is a type error instead of being silently
   discarded. *)
let () = main ()
You can pass the state as an argument to
`tokenize`. It still has to be mutable, but not global.

    rule tokenize inside = parse
      | ws { tokenize inside lexbuf }
      | lpar { if not !inside then begin
                 inside := true;
                 Enter
               end else begin
                 let buf = Buffer.create 20 in
                 String (string_scanner (Lexing.lexeme_start lexbuf) 0 buf lexbuf)
               end }
      | rpar { inside := false; Leave }

And you call the parser as follows: