Tail recursive string recognizers

A recent discussion at work about the best way to write a simple string checking function set me thinking…

Essentially, the problem is to recognize strings conforming to some regular expression, but with some extra conditions, that either can’t be expressed as a regexp or that are inconvenient to do so, so we can’t just plug it in to PCRE or whatever RE library is flavour of the month.

In this case, we wanted strings conforming to:

[0-9a-zA-Z*][-0-9a-zA-Z*]*(.[0-9a-zA-Z*][-0-9a-zA-Z*]*)*

ie. a path containing “.” separated components, possibly with “*” wildcards, but also with the condition that each component has at most one wildcard and with a limit on the length of each component.

Ignoring the extra conditions, we can define a suitable regular grammar:

S = S0
S0 = ALNUM + S1 | "*" + S1
S1 = ALNUM + S1 | "-" + S1 | "*" + S1 | "." + S0 | ""

and this naturally suggests a recursive descent recognizer, though in this case, there is no actual descent as the grammar is regular and all the recursions are tail calls.

Happily, these days compilers like GCC will compile tail calls to jumps (at least on higher optimization settings), just like ML compilers, for example, have always had to, and we can hope for reasonably good code from such an approach. Of course, once the tail calls have been optimized, we have a finite state recognizer with the state transitions implemented as gotos!

The extra conditions can now be easily added in to the state transition functions, with any variables needed just appearing as extra parameters (and because we need to specify the parameter value for every recursive call/state transition, we are less likely to forget to set something) so we end up with something like:

#include <ctype.h>

static inline bool check(const char *p);

static bool check1(const char *p, int n, bool seen) {
  if (n > 4) return false;
  if (isalnum(*p)) return check1(p+1, n+1, seen);
  if (*p == '-') return check1(p+1, n+1, seen);
  if (*p == '*') return !seen && check1(p+1,n+1,true);
  if (*p == '.') return check(p+1);
  if (*p == '\0') return true;
  return false;
}

static bool check(const char *p) {
  if (isalnum(*p)) return check1(p+1,1,false);
  if (*p == '*') return check1(p+1,1,true);
  return false;
}

Compiling with 64 bit gcc 4.7.2, -O2 (gcc only optimizes tail calls at -O2 and above), we get:

_Z5checkPKc:
	movq	%rbp, -8(%rsp)
	movq	%rbx, -16(%rsp)
	subq	$24, %rsp
	movzbl	(%rdi), %ebx
	movq	%rdi, %rbp
	call	__ctype_b_loc
	movq	(%rax), %rax
	movsbq	%bl, %rdx
	testb	$8, (%rax,%rdx,2)
	jne	.L7
	cmpb	$42, %bl
	je	.L8
	xorl	%eax, %eax
	movq	8(%rsp), %rbx
	movq	16(%rsp), %rbp
	addq	$24, %rsp
	ret
.L7:
	leaq	1(%rbp), %rdi
	xorl	%edx, %edx
.L5:
	movq	8(%rsp), %rbx
	movq	16(%rsp), %rbp
	movl	$1, %esi
	addq	$24, %rsp
	jmp	_ZL6check1PKcib
.L8:
	leaq	1(%rbp), %rdi
	movl	$1, %edx
	jmp	.L5
_ZL6check1PKcib:
	movq	%rbp, -24(%rsp)
	movq	%r12, -16(%rsp)
	movq	%rdi, %rbp
	movq	%r13, -8(%rsp)
	movq	%rbx, -32(%rsp)
	subq	$40, %rsp
	movzbl	(%rdi), %ebx
	movl	%esi, %r13d
	movl	%edx, %r12d
	call	__ctype_b_loc
	movq	(%rax), %rax
	movsbq	%bl, %rcx
	testb	$8, (%rax,%rcx,2)
	jne	.L23
	cmpb	$45, %bl
	je	.L23
	cmpb	$42, %bl
	je	.L24
	testb	%bl, %bl
	sete	%al
	cmpb	$46, %bl
	je	.L25
.L12:
	movq	8(%rsp), %rbx
	movq	16(%rsp), %rbp
	movq	24(%rsp), %r12
	movq	32(%rsp), %r13
	addq	$40, %rsp
	ret
.L23:
	leal	1(%r13), %esi
	cmpl	$4, %esi
	jg	.L14
	leaq	1(%rbp), %rdi
	movzbl	%r12b, %edx
.L21:
	movq	8(%rsp), %rbx
	movq	16(%rsp), %rbp
	movq	24(%rsp), %r12
	movq	32(%rsp), %r13
	addq	$40, %rsp
	jmp	_ZL6check1PKcib
.L24:
	xorl	%eax, %eax
	testb	%r12b, %r12b
	jne	.L12
	leal	1(%r13), %esi
	cmpl	$4, %esi
	jg	.L12
	leaq	1(%rbp), %rdi
	movl	$1, %edx
	jmp	.L21
.L14:
	xorl	%eax, %eax
	jmp	.L12
.L25:
	leaq	1(%rbp), %rdi
	movq	8(%rsp), %rbx
	movq	16(%rsp), %rbp
	movq	24(%rsp), %r12
	movq	32(%rsp), %r13
	addq	$40, %rsp
	jmp	_Z5checkPKc

It looks like the tail calls have indeed been compiled as jumps, so the stack won’t blow up. There is what looks like some unnecessary movement of function arguments between the stack and the function argument registers, but we’re not doing too badly. Probably not quite as fast as a hand-coded goto-based version, but only a little slower, almost certainly a price worth paying for the gain in clarity (in fact, most of the time seems to be spent in the calls to isalnum).