07 February 2010

Id rather bee sleeping: fast parsing of dbsnp/XML with libxml

Hello world ! I'm now in Japan for Biohackathon 2010 it's 00H30 here and, of course, I cannot sleep. I should be tired : I spent 11H00 in a plane, 1H30 in a train and three hours skating in Tokyo with my kick scooter :-)

I'm killing the time with libxml, the C library for parsing XML. In the current post I'll write a simple SAX parser extracting the flanking sequences from the SNPs of dbSNP/xml.
First we need to include a few files:
#include <libxml/parser.h>
#include <string>
#include <cstring>
#include <iostream>
The the C++ class DBSNPHandler that holds the current state of the parser:
class DBSNPHandler
{
public:

// current rs### id
std::string rs_id;
// current 5' sequence
std::string seq5;
// current observed variation
std::string observed;
// current 3' sequence
std::string seq3;
// current string handler by the SAX handler
std::string* content;
//did we find the sequence ?
bool sequence_found;

DBSNPHandler();
~DBSNPHandler();
void clear();
}
I defined a number of callback methods that will be called when events occur during parsing. 3 callbacks are needed here: when the parser opens an element, when the parse closes an element and when it finds some text. Each callback uses a DBSNPHandler as the value of the user data (ctx);
startElement is called when the parser opens an element. If it is a <Rs> tag then we go threw the attributes to find the rs## id. If it is a component of the sequence then we tell the DBSNPHandler that it should store the text in the 'content' variable.
/** called when an TAG is opened */
void startElement(void * ctx,
const xmlChar * localname,
const xmlChar * prefix,
const xmlChar * URI,
int nb_namespaces,
const xmlChar ** namespaces,
int nb_attributes,
int nb_defaulted,
const xmlChar ** attributes)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(strcmp( (char*) localname,"Rs")==0)
{
state->clear();
//loop over the attributes an find the rs###
for(int i=0;i< nb_attributes;++i)
{
if(strcmp((char*) attributes[i*5],"rsId")!=0) continue;
int len=(char*) attributes[i*5+4]-(char*) attributes[i*5+3];

state->rs_id.assign(
(char*) attributes[i*5+3],
len
);
break;
}
}
else if((strcmp( (char*) localname,"Seq5")==0 ||
strcmp( (char*) localname,"Observed")==0 ||
strcmp( (char*) localname,"Seq3")==0) &&
state->sequence_found==false
)
{
//tells the DBSNPHandler we need to get the next text content
state->content=new std::string;
}
}

handleCharacters is called by the SAX parser it finds some text. This text is appended to the current content if we are parsing the sequence component of the current rs####.
static void handleCharacters(void * ctx, const xmlChar * ch, int len)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(state->content!=NULL)
{
state->content->append((char*)ch,len);
}
}

endElement is called when the parser closes an element. If it is a component of the sequences, the fields (seq5,seq3...) of the current state are updated. It it is a <Rs> element, then its sequence is printed out and the state is re-initialized.
static void endElement(void * ctx,
const xmlChar * localname,
const xmlChar * prefix,
const xmlChar * URI)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(strcmp( (char*) localname,"Rs")==0)
{
std::cout << "rs" << state->rs_id <<
"\t" << state->seq5 <<
"[" << state->observed << "]" <<
state->seq3 << std::endl;

//we're done with this SNP, clear the state
state->clear();
}
else if(state->content!=NULL && strcmp( (char*) localname,"Seq5")==0)
{
state->seq5.assign(*(state->content));
delete state->content;
state->content=NULL;
}
else if(state->content!=NULL && strcmp( (char*) localname,"Observed")==0)
{
state->observed.assign(*(state->content));
delete state->content;
state->content=NULL;
}
else if(state->content!=NULL && strcmp( (char*) localname,"Seq3")==0)
{
state->seq3.assign(*(state->content));
delete state->content;
state->content=NULL;
state->sequence_found=true;
}
}
In the main part, the SAX handler is initialized with the previous callbacks and the xml files are parsed:
xmlSAXHandler handler;
DBSNPHandler state;
(...)
handler.startElementNs= startElement;
handler.endElementNs=endElement;
handler.characters=handleCharacters;

for(int i=1;i< argc;++i)
{
xmlSAXUserParseFile(&handler,&state,argv[i]);
}

Compiling

g++ `xml2-config --cflags --libs` dbsnp.c

Running

a.out ds_ch17.xml.gz
rs69624490 CTCCAGCCCGGGCCCACCCTACAGCCGACACCAAGTTGCGTCACCGTGATCTGGACACCCAGACGTACATTAGAGCTGCTTTCCTTGATGAGCTCAGAACC[C/G]GGTACTCCACCAGCAGGAGCAGGAGGCTCTCCCACCTCCACCTGCCACTGGCCGACAGCTCCCAGTGCGTTCTTCAGGCGCCACTTTCTCGCTGGAGAAAA
rs69626771 GAAATTCCTACAAAAATCCATCTTTGTGGCATCATCAGGGGTGATCTGTCCTTGCAGGAATATCTCACGTCCTCTGTTTGTACCTTGACACGCTTGGCTGA[G/T]CTGTAGTTAAACTGGTAACGAAGGTTCCACCCTTCCTCCCAGGCCACAGCGCCCCCAGCAGTCTTGCACCAGATTCGAATTTATGAACCCACAGCACTTGC
rs69628798 ACCTCAGTAAAATGAAGATTATTACTATGTGCCCTATGCAGGACAGGGACTGTGTTCTGATACAGGCCCTGATTAGGTTAGTACAGTGCCTGGCACATAGT[A/G]TTGAACAGATATCAGAAAAAGGGGGAGAGAGAAATCAGTTGGTTGGGAGGAGAATGAGGGGGGCAGGAGGGTCCTGCAGGGTGTTGCAGGTGAGAAATGAC
rs69633675 GTTGTTGGTGAGGGGAGGGAGTGGGGCAAGAGGAACAGTGTGGTCTAGAGGATAGAGCAAGGGAATGGGAGTCAGAAGGACCTGTGTTCTGGAGAAGCAGC[A/G]TGGCTCAGTGGAAAGAACACGGGCTTTGGAGTCAGAGATCAGGGGTTCGAATCCCGGCTCTGCCACTTGGCAGCTGTGTGACTGTGGGCAAGTCACTTCAC
rs69638381 GGCCACGCTGCTTGTCATAGCGGCTTTCCAGCTCTGCCCTCCGCAGCAAGGGCAGCGCTCACCTAGGCAAGCCCAGAGGGCTTAGGAGGGAGGGGCGGGGC[G/T]GGGCCGTACCGATGAGTTCTCCCGGGGAGAGACCAGGAGCTCTGAGTCAGGAAGGGAATCAAAAGGCGACAGGCTCTCATCATCAGTGTTGCCCAGGATCT
rs69640257 TTTCCTCTTCTCCTTGGCCCTGCATTATCCCCACTATTTGACTTGTCCAGGTCAGCCTTTGTAAATAAAAACAACAGGCTTACCAGCCTACCTTTCTCCCA[A/G]ACTCTTTACCATCACTTATTCTCAAGGCTGGTCAAAACCTTCTCTGTCATTGTCATTTGTTTATTGAGCATCCATCTTCTACAGAGCCCTGGAGTAAGCGT
rs69640260 CTCAAAAGAGAATGGTTCCTTTAGGTCCCTGAGGACACCCCAGGAAGGTTGGGTATCCCTTGCTTCAATTATCAGAGGTAAAAACTCTCTCTCTAAAGGAG[C/T]AGCGTGTCTTAGTGGAAAAAGCACAAGTCTGGGAGTCAGAGTATCTGGGTTCTAATCCCCACCACCCAATGCTTGCTGTGTGACCTTGGGCCTCAGTTACC
rs69641743 TCTATACATTGTTTAGTTCCTCTCCCCCACTAGACTGTAAACACCTTGAGGGCAAGGAGCATCTCTTCTGAATCTGTTATATTCTCACAGGAGCAGCATGG[A/C]CTAGCAAATAGAGCACGGGCCTGGGAATCTGAATAGGTTCTAATCCTGGCTCTGCCACTTGTCTGCTATGGGACCTTGGGCAAATGATTTAACTTCTCTGT
(...)

The code

#include <libxml/parser.h>
#include <string>
#include <cstring>
#include <iostream>
/**
* Holds the state of the parser
*/
class DBSNPHandler
{
public:
// current rs### id
std::string rs_id;
// current 5' sequence
std::string seq5;
// current observed variation
std::string observed;
// current 3' sequence
std::string seq3;
// current string handler by the SAX handler
std::string* content;
//did we find the sequence ?
bool sequence_found;

DBSNPHandler():content(NULL),sequence_found(false)
{
}

~DBSNPHandler()
{
clear();
}

void clear()
{
if(content!=NULL) delete content;
content=NULL;
sequence_found=false;
rs_id.clear();
seq5.clear();
observed.clear();
seq3.clear();
content=NULL;
}
};

/** called when an TAG is opened */
static void startElement(void * ctx,
const xmlChar * localname,
const xmlChar * prefix,
const xmlChar * URI,
int nb_namespaces,
const xmlChar ** namespaces,
int nb_attributes,
int nb_defaulted,
const xmlChar ** attributes)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(strcmp( (char*) localname,"Rs")==0)
{
state->clear();

for(int i=0;i< nb_attributes;++i)
{
if(strcmp((char*) attributes[i*5],"rsId")!=0) continue;
int len=(char*) attributes[i*5+4]-(char*) attributes[i*5+3];

state->rs_id.assign(
(char*) attributes[i*5+3],
len
);
break;
}
}
else if((strcmp( (char*) localname,"Seq5")==0 ||
strcmp( (char*) localname,"Observed")==0 ||
strcmp( (char*) localname,"Seq3")==0) &&
state->sequence_found==false
)
{
state->content=new std::string;
}
}


/** called when an TAG is closed */
static void endElement(void * ctx,
const xmlChar * localname,
const xmlChar * prefix,
const xmlChar * URI)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(strcmp( (char*) localname,"Rs")==0)
{
std::cout << "rs" << state->rs_id <<
"\t" << state->seq5 <<
"[" << state->observed << "]" <<
state->seq3 << std::endl;

//we're done with this SNP, clear the state
state->clear();
}
else if(state->content!=NULL && strcmp( (char*) localname,"Seq5")==0)
{
state->seq5.assign(*(state->content));
delete state->content;
state->content=NULL;
}
else if(state->content!=NULL && strcmp( (char*) localname,"Observed")==0)
{
state->observed.assign(*(state->content));
delete state->content;
state->content=NULL;
}
else if(state->content!=NULL && strcmp( (char*) localname,"Seq3")==0)
{
state->seq3.assign(*(state->content));
delete state->content;
state->content=NULL;
state->sequence_found=true;
}
}

static void handleCharacters(void * ctx, const xmlChar * ch, int len)
{
DBSNPHandler* state=(DBSNPHandler*)ctx;
if(state->content!=NULL)
{
state->content->append((char*)ch,len);
}
}


int main(int argc,char **argv)
{
int res;
LIBXML_TEST_VERSION
xmlSAXHandler handler;
DBSNPHandler state;

memset(&handler,0,sizeof(xmlSAXHandler));


handler.startElementNs= startElement;
handler.endElementNs=endElement;
handler.characters=handleCharacters;
handler.initialized = XML_SAX2_MAGIC;

for(int i=1;i< argc;++i)
{
res=xmlSAXUserParseFile(&handler,&state,argv[i]);

if(res!=0)
{
std::cerr << "Error "<< res << argv[i] << std::endl;
}
}
xmlCleanupParser();
xmlMemoryDump();

return(0);
}


That's it !
Pierre

No comments: