14 December 2009

Parsing a genetic code using flex and bison. Part 1/2

In this post I'll show how to write a lexer and a parser using Flex and Bison for parsing the genetic codes at NCBI. I'll write a non-reentrant parser that is to say that the generated code contains some global variables and cannot be safely executed concurrently. The input file is an ASN1 file that looks like this:
--*************************************************************************
-- This is the NCBI genetic code table
-- (...)
--**************************************************************************
Genetic-code-table ::= {
{
name "Standard" ,
name "SGC0" ,
id 1 ,
ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "---M---------------M---------------M----------------------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
},
{
(...)
} ,
{
name "Thraustochytrium Mitochondrial" ,
id 23 ,
ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "--------------------------------M--M---------------M------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
}
}
Each genetic code will be stored as a linked list of C structures called GeneticCode:
file gc.h:
#ifndef GENETIC_CODE_H
#define GENETIC_CODE_H
typedef struct geneticCode
{
char* name1;
char* name2;
int id;
char* ncbieaa;
char* sncbieaa;
struct geneticCode *next;
}GeneticCodeStruct,*GeneticCode;
#endif
.The LEXER generated by flex converts the input into a sequence of tokens, and the parser generated by bison converts a grammar description into a C program.

Your browser does not support the <CANVAS> element !



The Parser

The grammar of Bison says that the lexical context might carry a string (the code ID) , a string (ncbieaa,ncbieaa...) or a GeneticCode:
%union {
char* s;
int integer;
GeneticCode gCode;
}
(...)
%token<integer> INTEGER
%token<s> TEXT
A genetic code is a series of tokens consisting in one or two names, the id, the ncbiaa and the sncbiaa strings. Every time this context is found, a new GeneticCode is allocated and its fields are filled:

%type<gCode> code
(...)
code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
An optional name is 'nothing' or the following tokens: 'NAME TEXT COMMA'
optional_name:/* nothing */
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}

A series of genetic codes is one genetic code, or a series of genetic codes followed by one genetic code:
codes: code
{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;
At last, the input constists in the tokens 'GENETIC_CODE_TABLE ASSIGN OPEN' and 'CLOSE' , surrounding a series of genetic codes. Each time, an input has been parsed, the linked list is scanned so each genetic code is printed to stdout.
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};

The Lexer

The lexers breaks the input into tokens. In this lexer, comments are ignored:
\-\-.*\n ;/* ignore comments */
... keywords are returned
Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
... and the tokens carrying a context (integers, strings...) are decoded:
[0-9]+ {gc_lval.integer=atoi(yytext) ;return INTEGER;}
\"[^\"]+\" {
//copy the string without the quotes
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}

All in one here is the flex file: gc.l
%option prefix="gc_"
%option noyywrap


%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}

%%
\-\-.*\n ;/* ignore comments */

Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {gc_lval.integer=atoi(yytext) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];

%%
... the bison file gc.y:
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}

%union {
char* s;
int integer;
GeneticCode gCode;
}

%name-prefix="gc_"
%defines
%error-verbose

%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
%token<integer> INTEGER
%token<s> TEXT

%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input

%{
void yyerror(const char* message)
{
fprintf(stderr,"ERROR:%s\n",message);
}

%}

%%

input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};

codes: code
{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;


code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
optional_name:/* nothing */
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}
%%

int main(int argc,char** argv)
{
yyparse();
return 0;
}
Compiling:
flex gc.l
bison gc.y
gcc -o a.out gc.tab.c lex.gc_.c
Testing:
cat gc.ptr |./a.out | ./a.out |./a.out |./a.out

Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}

That's it.
Pierre

In the next post I'll write a reentrant parser using flex and bison.

No comments: