Tail recursive string recognizers

A recent discussion at work about the best way to write a simple string checking function set me thinking…

Essentially, the problem is to recognize strings conforming to some regular expression, but with some extra conditions that either can’t be expressed as a regexp or are inconvenient to express as one, so we can’t just plug it in to PCRE or whatever RE library is flavour of the month.

In this case, we wanted strings conforming to:

[0-9a-zA-Z*][-0-9a-zA-Z*]*(\.[0-9a-zA-Z*][-0-9a-zA-Z*]*)*

i.e. a path containing “.” separated components, possibly with “*” wildcards, but also with the condition that each component has at most one wildcard and with a limit on the length of each component.

Ignoring the extra conditions, we can define a suitable regular grammar:

S = S0
S0 = ALNUM + S1 | "*" + S1
S1 = ALNUM + S1 | "-" + S1 | "*" + S1 | "." + S0 | ""

and this naturally suggests a recursive descent recognizer, though in this case, there is no actual descent as the grammar is regular and all the recursions are tail calls.

Happily, these days compilers like GCC will compile tail calls to jumps (at least on higher optimization settings), just like ML compilers, for example, have always had to, and we can hope for reasonably good code from such an approach. Of course, once the tail calls have been optimized, we have a finite state recognizer with the state transitions implemented as gotos!

The extra conditions can now be easily added in to the state transition functions, with any variables needed just appearing as extra parameters (and because we need to specify the parameter value for every recursive call/state transition, we are less likely to forget to set something) so we end up with something like:

#include <ctype.h>
#include <stdbool.h>

static inline bool check(const char *p);

static bool check1(const char *p, int n, bool seen) {
  if (n > 4) return false;
  if (isalnum(*p)) return check1(p+1, n+1, seen);
  if (*p == '-') return check1(p+1, n+1, seen);
  if (*p == '*') return !seen && check1(p+1,n+1,true);
  if (*p == '.') return check(p+1);
  if (*p == '\0') return true;
  return false;
}

static bool check(const char *p) {
  if (isalnum(*p)) return check1(p+1,1,false);
  if (*p == '*') return check1(p+1,1,true);
  return false;
}

Compiling with 64 bit gcc 4.7.2, -O2 (gcc only optimizes tail calls at -O2 and above), we get:

_Z5checkPKc:
    movq    %rbp, -8(%rsp)
    movq    %rbx, -16(%rsp)
    subq    $24, %rsp
    movzbl  (%rdi), %ebx
    movq    %rdi, %rbp
    call    __ctype_b_loc
    movq    (%rax), %rax
    movsbq  %bl, %rdx
    testb   $8, (%rax,%rdx,2)
    jne .L7
    cmpb    $42, %bl
    je  .L8
    xorl    %eax, %eax
    movq    8(%rsp), %rbx
    movq    16(%rsp), %rbp
    addq    $24, %rsp
    ret
.L7:
    leaq    1(%rbp), %rdi
    xorl    %edx, %edx
.L5:
    movq    8(%rsp), %rbx
    movq    16(%rsp), %rbp
    movl    $1, %esi
    addq    $24, %rsp
    jmp _ZL6check1PKcib
.L8:
    leaq    1(%rbp), %rdi
    movl    $1, %edx
    jmp .L5
_ZL6check1PKcib:
    movq    %rbp, -24(%rsp)
    movq    %r12, -16(%rsp)
    movq    %rdi, %rbp
    movq    %r13, -8(%rsp)
    movq    %rbx, -32(%rsp)
    subq    $40, %rsp
    movzbl  (%rdi), %ebx
    movl    %esi, %r13d
    movl    %edx, %r12d
    call    __ctype_b_loc
    movq    (%rax), %rax
    movsbq  %bl, %rcx
    testb   $8, (%rax,%rcx,2)
    jne .L23
    cmpb    $45, %bl
    je  .L23
    cmpb    $42, %bl
    je  .L24
    testb   %bl, %bl
    sete    %al
    cmpb    $46, %bl
    je  .L25
.L12:
    movq    8(%rsp), %rbx
    movq    16(%rsp), %rbp
    movq    24(%rsp), %r12
    movq    32(%rsp), %r13
    addq    $40, %rsp
    ret
.L23:
    leal    1(%r13), %esi
    cmpl    $4, %esi
    jg  .L14
    leaq    1(%rbp), %rdi
    movzbl  %r12b, %edx
.L21:
    movq    8(%rsp), %rbx
    movq    16(%rsp), %rbp
    movq    24(%rsp), %r12
    movq    32(%rsp), %r13
    addq    $40, %rsp
    jmp _ZL6check1PKcib
.L24:
    xorl    %eax, %eax
    testb   %r12b, %r12b
    jne .L12
    leal    1(%r13), %esi
    cmpl    $4, %esi
    jg  .L12
    leaq    1(%rbp), %rdi
    movl    $1, %edx
    jmp .L21
.L14:
    xorl    %eax, %eax
    jmp .L12
.L25:
    leaq    1(%rbp), %rdi
    movq    8(%rsp), %rbx
    movq    16(%rsp), %rbp
    movq    24(%rsp), %r12
    movq    32(%rsp), %r13
    addq    $40, %rsp
    jmp _Z5checkPKc

It looks like the tail calls have indeed been compiled as jumps, so the stack won’t blow up. There is what looks like some unnecessary movement of function arguments between the stack and the function argument registers, but we’re not doing too badly. Probably not quite as fast as a hand-coded goto-based version, but only a little slower, almost certainly a price worth paying for the gain in clarity (in fact, most of the time seems to be spent in the calls to isalnum).