Source code for icepop.geo

#!/usr/bin/env python
""" 
Provides functions to parse `GEO <http://www.ncbi.nlm.nih.gov/geo/>`_ files
in various formats. These functions are built on top of
the `GEOparse <https://geoparse.readthedocs.org>`_ package.
"""
__author__    = "Edward Wijaya <ewijaya@gmail.com>"
__copyright__ = "Copyright 2015"
import json
import GEOparse
import pandas as pd

def load(geoid=None, destdir=None, filepath=None):
    """
    Download or read a GEO file in SOFT format.

    :param geoid: string, GEO id (e.g. "GSE69886"). Only used when
        ``filepath`` is not given.
    :param destdir: string, destination to store the files. Only used
        when ``filepath`` is not given.
    :param filepath: string, path to an already downloaded SOFT file.
        When given (truthy) it takes precedence over ``geoid``.
    :returns: the object returned by ``GEOparse.get_GEO``.

    Usage:

    >>> from icepop import geo
    >>> gse = geo.load(geoid="GSE74306", destdir="./")

    or if the data is already downloaded

    >>> gse = geo.load(filepath="./GSE74306.soft.gz")
    """
    if not filepath:
        # No local file supplied: let GEOparse resolve the record by id.
        return GEOparse.get_GEO(geo=geoid, destdir=destdir)
    return GEOparse.get_GEO(filepath=filepath)

def get_gpl(handle=None, id=None):
    """
    Return one GPL entry from the given GSE object.

    :param handle: a GSE handler; its ``gpls`` mapping is read.
    :param id: int, position (within ``handle.gpls.values()``) of the GPL
        you want to return. ``None`` or ``0`` selects the first one.
    :returns: the selected value of ``handle.gpls``.
    :raises IndexError: if the position is out of range, including the
        case where ``handle.gpls`` is empty.
    """
    # Materialise the values before indexing: on Python 3 ``dict.values()``
    # is a view that does not support ``[]``; on Python 2 this is a cheap
    # list copy.
    gpls = list(handle.gpls.values())
    # ``id or 0`` keeps the original rule: any falsy id means "first".
    return gpls[id or 0]

def iterate(handle=None, type="gsm", anncol="Gene Symbol"):
    """
    Iterate over the content of a GEO object.

    :param handle: a GSE handler.
    :param type: string, iterator type. Only ``"gsm"`` yields anything;
        any other value produces an empty iterator.
    :param anncol: string, annotation column. Derived from GPL class.

    :returns: Iterator that holds GSM name and expression values
        data frame, restricted to the columns ``ID_REF``, ``anncol``
        and ``VALUE``.

    Usage:

    >>> for gsm_name, gsm_df in geo.iterate(handle=gse):
    ...     print(gsm_name)
    ...     print(gsm_df.head())
    """
    if type != "gsm":
        return

    gpl = get_gpl(handle=handle)
    wanted_columns = ["ID_REF", anncol, "VALUE"]
    # ``items()`` works on both Python 2 and 3 mappings, whereas
    # ``iteritems()`` exists on Python 2 only.
    for gsm_name, gsm in handle.gsms.items():
        annotated = gsm.annotate(gpl, anncol)
        yield gsm_name, annotated[wanted_columns]

def accumulate(handle=None, type="gsm", anncol="Gene Symbol", gpl_id=None):
    """
    Instead of iterating content of GEO object, return one single
    data frame with the GSMs combined.

    :param handle: a GSE handler.
    :param type: string, iterator type. Only ``"gsm"`` is handled; any
        other value returns ``None``.
    :param anncol: string, annotation column. Derived from GPL class.
    :param gpl_id: int, index of GPL id you want to return.

    :returns: Data frame with columns ``ID_REF``, ``anncol`` and one
        column per non-empty GSM (named after the GSM). Rows whose
        annotation is ``"---"`` or missing are dropped. If no GSM
        contributes any rows, an empty data frame with only the
        ``ID_REF`` and ``anncol`` columns is returned.

    Usage:

    >>> full_df = geo.accumulate(handle=gse,type="gsm",anncol="Gene Symbol")
    """
    # Function-scope import: ``reduce`` is a builtin on Python 2 only.
    from functools import reduce

    if type != "gsm":
        return None

    gpl = get_gpl(handle=handle, id=gpl_id)
    key_columns = ["ID_REF", anncol]

    all_dfs = []
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not.
    for gsm_name, gsm in handle.gsms.items():
        outdf = gsm.annotate(gpl, anncol)
        outdf = outdf[key_columns + ["VALUE"]]
        # Name the value column after its sample so the merged frame
        # ends up with one column per GSM.
        outdf.columns = key_columns + [gsm_name]
        if outdf.empty:
            continue
        all_dfs.append(outdf)

    if not all_dfs:
        # ``reduce`` over an empty list raises TypeError; return an empty
        # frame with the key columns instead.
        return pd.DataFrame(columns=key_columns)

    # NOTE: ``fillna`` applies to every column, so missing expression
    # values are also replaced by the string "NoSymbol".
    merged_df = reduce(
        lambda ldf, rdf: pd.merge(ldf, rdf, on=key_columns),
        all_dfs,
    ).fillna("NoSymbol")

    # Keep only rows that have a gene symbol.
    unwanted = ["---", "NoSymbol"]
    return merged_df[~merged_df[anncol].isin(unwanted)]

# Script entry point.
# NOTE(review): no `main` function is defined in the visible part of this
# module, so running the file directly would raise NameError -- confirm
# whether `main` is defined elsewhere or this guard should be removed.
if __name__ == '__main__': main()