23 February 2010

A Mysql user defined function (UDF) for Gene Ontology (GO)

In this post I'll create a Mysql Defined function (UDF) answering if a defined GO term is a descendant of another term. This post is not much different from two previous posts I wrote here:

Here I built a binary file containing an array of pairs(term,parent_term) of GO identifiers from the XML/RDF file o_daily-termdb.rdf-xml.gz. This file is then used by the mysql UDF to find all the ancestors of a given term. The source code is available here:

Building the database

The libxml API is used to load go_daily-termdb.rdf-xml.gz. For each go:term, the parent terms are searched and each pair(term,parent-term) is saved into the array.
static void scanterm(xmlNode *term)
{
xmlAttrPtr rsrc;
xmlNode *n=NULL;
xmlChar * acn=NULL;
for(n=term->children; n!=NULL; n=n->next)
{
if (n->type != XML_ELEMENT_NODE) continue;
if(strcmp(n->name,"accession")==0 &&
n->ns!=NULL && n->ns->href!=NULL &&
strcmp(n->ns->href,GO_NS)==0)
{
acn= xmlNodeGetContent(n);
break;
}
}
if(acn==NULL) return;

for(n=term->children; n!=NULL; n=n->next)
{
if (n->type != XML_ELEMENT_NODE) continue;
if(strcmp(n->name,"is_a")==0 &&
n->ns!=NULL && n->ns->href!=NULL &&
strcmp(n->ns->href,GO_NS)==0
)
{
xmlChar* is_a=xmlGetNsProp(n,"resource",RDF_NS);
if(is_a!=NULL)
{
char* p=strstr(is_a,"#GO:");
if(p!=NULL)
{
++p;
termdb.terms=(TermPtr)realloc(termdb.terms,sizeof(Term)*(termdb.n_terms+1));
if(termdb.terms==NULL)
{
fprintf(stderr,"out of memory\n");
exit(EXIT_FAILURE);
}
strncpy(termdb.terms[termdb.n_terms].parent,p,MAX_TERM_LENGTH);
strncpy(termdb.terms[termdb.n_terms].child,acn,MAX_TERM_LENGTH);
++termdb.n_terms;
}
xmlFree(is_a);
}
}
}
xmlFree(acn);
}
When the RDF file has been read, the content of the array is sorted by term and saved to a file
qsort(termdb.terms,termdb.n_terms,sizeof(Term),cmp_term);
out= fopen(argv[1],"w");
if(out==NULL)
{
fprintf(stderr,"Cannot open %s\n",argv[1]);
return -1;
}
fwrite(&(termdb.n_terms),sizeof(int),1,out);
fwrite(termdb.terms,sizeof(Term),termdb.n_terms,out);
fflush(out);
fclose(out);

Initializing the mysql UDF

When the mysql UDF is inited the binary file is loaded into memory (error handling ignored)
my_bool go_isa_init(
UDF_INIT *initid,
UDF_ARGS *args,
char *message
)
{
TermDBPtr termdb;
FILE* in=NULL;

initid->maybe_null=1;
initid->ptr= NULL;

termdb=(TermDBPtr)malloc(sizeof(TermDB));


in=fopen(GO_PATH,"r");
fread(&(termdb->n_terms),sizeof(int),1,in);
termdb->terms=malloc(sizeof(Term)*(termdb->n_terms));
fread(termdb->terms,sizeof(Term),termdb->n_terms,in);
fclose(in);
initid->ptr=(void*)termdb;
return 0;
}

Disposing the mysql UDF

When the UDF is disposed, we just free the memory
/* The deinitialization function */
void go_isa_deinit(UDF_INIT *initid)
{
/* free the memory **/
if(initid->ptr!=NULL)
{
TermDBPtr termdb=(TermDBPtr)initid->ptr;
if(termdb->terms!=NULL) free(termdb->terms);
free(termdb);
initid->ptr=NULL;
}
}

invoking the UDF

The UDF itself will scan the GO directed tree and will get all the ancestors of a given term:
long long go_isa(UDF_INIT *initid, UDF_ARGS *args,
char *is_null, char *error)
{
(...)
index=termdb_findIndexByName(termdb,term);
if(index==-1)
{
return 0;
}
return recursive_search(termdb,index,parent);
}

static int recursive_search(const TermDBPtr db,int index, const char* parent)
{
int rez=0;
int start=index;
int parent_idx=0;

if(start<0 || start>=db->n_terms) return 0;
if(strcmp(db->terms[index].child,parent)==0) return 1;
while(index < db->n_terms)
{
if(strcmp(db->terms[index].child,db->terms[start].child)!=0) break;
if(strcmp(db->terms[index].parent,parent)==0) return 1;
parent_idx= termdb_findIndexByName(db,db->terms[index].parent);
rez= recursive_search(db,parent_idx,parent);
if(rez==1) return 1;
++index;
}
return 0;
}
As the terms have been ordered by their names, a binary_search is used to find the index of a given term:
static int lower_bound(const TermDBPtr termsdb, const char* name)
{
int low = 0;
int len= termsdb->n_terms;

while(len>0)
{
int half=len/2;
int mid=low+half;
if( strncmp(termsdb->terms[mid].child,name,MAX_TERM_LENGTH)<0)
{
low=mid;
++low;
len=len-half-1;
}
else
{
len=half;
}
}
return low;
}


static int termdb_findIndexByName(const TermDBPtr termsdb,const char* name)
{
int i=0;
if(name==NULL || termsdb==NULL || termsdb->terms==NULL || termsdb->n_terms==0) return -1;
i= lower_bound(termsdb,name);
if(i<0 || i >= termsdb->n_terms || strcmp(termsdb->terms[i].child,name)!=0) return -1;
return i;
}

Compiling

On my laptop, the UDF was compiled using
gcc -shared -DGO_PATH='"/tmp/terms.bin"' -I /usr/include -I /usr/local/include -I /usr/include/mysql -o /usr/lib/go_udf.so src/go_udf.c

Creating the function

mysql> create function go_isa RETURNS INTEGER SONAME 'go_udf.so';

Invoking the UDF function

Finf all the GO terms that are a descendant of GO:0016859 (cis-trans isomerase activity):
mysql> select LEFT(T.name,20) as name,T.acc from go_latest.term as T where go_isa(T.acc,"GO:0016859");
+----------------------+------------+
| name | acc |
+----------------------+------------+
| peptidyl-prolyl cis- | GO:0003755 |
| retinal isomerase ac | GO:0004744 |
| maleylacetoacetate i | GO:0016034 |
| cis-trans isomerase | GO:0016859 |
| cis-4-[2-(3-hydroxy) | GO:0018839 |
| trans-geranyl-CoA is | GO:0034872 |
| carotenoid isomerase | GO:0046608 |
| 2-chloro-4-carboxyme | GO:0047466 |
| 4-hydroxyphenylaceta | GO:0047467 |
| farnesol 2-isomerase | GO:0047885 |
| furylfuramide isomer | GO:0047907 |
| linoleate isomerase | GO:0050058 |
| maleate isomerase ac | GO:0050076 |
| maleylpyruvate isome | GO:0050077 |
| retinol isomerase ac | GO:0050251 |
+----------------------+------------+
15 rows in set (1.27 sec)

Disposing the UDF function

drop function go_isa;


That's it
Pierre

No comments: