Source code for owmeta.document

from six.moves.urllib.parse import urlparse, urlencode
from six.moves.urllib.request import Request, urlopen
from six.moves.urllib.error import HTTPError, URLError
import re
import logging

from owmeta_core.graph_object import IdentifierMissingException
from owmeta_core.context import Context
from owmeta_core.dataobject import DataObject, DatatypeProperty, Alias

from . import SCI_CTX
from . import bibtex as BIB

logger = logging.getLogger(__name__)


class WormbaseRetrievalException(Exception):
    """ Raised when data cannot be retrieved from WormBase """
class PubmedRetrievalException(Exception):
    """ Raised when data cannot be retrieved from PubMed """
# A little bit about why this is a separate type from Document:
#
# This type corresponds to a document which has some statements that we care
# about. The key reason this is distinct from Document is that a document need
# not provide evidence of anything. For example, the `WormData.n4` file
# generated by insert_worm.py is a document, but it doesn't provide any
# scientific or logical justification for any of the statements made within it.
class BaseDocument(DataObject):
    """
    Base type for documents whose contained statements we care about.
    """
    class_context = SCI_CTX

    def make_context_identifier(self):
        # The context identifier is derived from this document's own identifier
        return self.make_identifier(self.identifier)

    @property
    def as_context(self):
        """ A `Context` whose identifier is derived from this document's identifier """
        ctx_ident = self.make_context_identifier()
        if self.context is None:
            return Context(ident=ctx_ident)
        return Context.contextualize(self.context)(ident=ctx_ident)
class Document(BaseDocument):
    """
    A representation of some document.

    Possible keys include::

        pmid, pubmed: a pubmed id or url (e.g., 24098140)
        wbid, wormbase: a wormbase id or url (e.g., WBPaper00044287)
        doi: a Digital Object id or url (e.g., s00454-010-9273-0)
        uri: a URI specific to the document, preferably usable for accessing
             the document
    """

    class_context = SCI_CTX

    author = DatatypeProperty(multiple=True)
    ''' An author of the document '''

    doi = DatatypeProperty()
    ''' A Digital Object Identifier (DOI), optional '''

    uri = DatatypeProperty(multiple=True)
    ''' A non-standard URI for the document '''

    wbid = DatatypeProperty()
    ''' An ID from WormBase.org that points to a record, optional '''

    wormbaseid = Alias(wbid)
    ''' An alias to `wbid` '''

    pmid = DatatypeProperty()
    ''' A PubMed ID (PMID) that points to a paper '''

    year = DatatypeProperty()
    ''' The year (e.g., publication year) of the document '''

    date = Alias(year)
    ''' Alias to year '''

    title = DatatypeProperty()
    ''' The title of the document '''

    def __init__(self, bibtex=None, doi=None, pubmed=None, wormbase=None,
                 **kwargs):
        """
        Parameters
        ----------
        bibtex : string
            A string containing a single BibTeX entry. Parsed during
            initialization, but not saved thereafter. optional
        doi : string
            A Digital Object Identifier (DOI). optional
        pubmed : string
            A PubMed ID (PMID) or URL that points to a paper. Ignored if
            'pmid' is provided. optional
        wormbase : string
            An ID or URL from WormBase that points to a record. Ignored if
            `wbid` or `wormbaseid` are provided. optional
        """
        super(Document, self).__init__(**kwargs)
        # Order in which IDs are consulted for defined_augment/identifier_augment
        self.id_precedence = ('doi', 'pmid', 'wbid', 'uri')

        if bibtex is not None:
            self.update_with_bibtex(bibtex)

        # `pubmed` may be either a bare PMID or a PubMed URL; an already-set
        # `pmid` property takes precedence
        if pubmed is not None and not self.pmid.has_defined_value():
            if pubmed[:4] == 'http':
                _tmp = _pubmed_uri_to_pmid(pubmed)
                if _tmp is None:
                    raise ValueError("Couldn't convert Pubmed URL to a PubMed ID")
                pmid = _tmp
            else:
                pmid = pubmed
            self.pmid.set(pmid)

        # Likewise, `wormbase` may be a bare WormBase ID or a WormBase URL
        if wormbase is not None and not self.wbid.has_defined_value():
            if wormbase[:4] == 'http':
                _tmp = _wormbase_uri_to_wbid(wormbase)
                if _tmp is None:
                    raise ValueError("Couldn't convert Wormbase URL to a Wormbase ID")
                wbid = _tmp
            else:
                wbid = wormbase
            self.wbid.set(wbid)

        # A DOI URL is reduced to the bare DOI when possible; otherwise the
        # value is stored as given
        if doi is not None:
            if doi[:4] == 'http':
                _tmp = _doi_uri_to_doi(doi)
                if _tmp is not None:
                    doi = _tmp
            self.doi.set(doi)

    def update_with_bibtex(self, bibtex):
        """
        Fill in this document's fields from a single-entry BibTeX string.

        Raises
        ------
        ValueError
            If `bibtex` contains more than one entry
        """
        bib_db = BIB.loads(bibtex)
        if len(bib_db.entries) > 1:
            # BUG FIX: the count must come from bib_db.entries -- the database
            # object itself does not support len(), so formatting the error
            # would itself raise
            raise ValueError('The given BibTex string has %d entries.'
                             ' Cannot determine which entry to use for the document' %
                             len(bib_db.entries))
        BIB.update_document_with_bibtex(self, bib_db.entries[0])

    def defined_augment(self):
        """ True when any of the identifying properties has a defined value """
        for x in self.id_precedence:
            if getattr(self, x).has_defined_value():
                return True
        return False

    def identifier_augment(self):
        """
        Make an identifier from the highest-precedence ID with a defined value.

        Raises
        ------
        IdentifierMissingException
            When no identifying property has a defined value
        """
        for idKind in self.id_precedence:
            idprop = getattr(self, idKind)
            if idprop.has_defined_value():
                s = str(idKind) + ":" + idprop.defined_values[0].identifier.n3()
                return self.make_identifier(s)
        raise IdentifierMissingException(self)

    # TODO: Provide a way to override modification of already set values.
    def update_from_wormbase(self, replace_existing=False):
        """
        Queries wormbase for additional data to fill in the Document.

        If replace_existing is set to `True`, then existing values will be
        cleared.
        """
        # XXX: wormbase's REST API is pretty sparse in terms of data provided.
        #     Would be better off using AQL or the perl interface
        # _Very_ few of these have these fields filled in
        wbid = self.wbid.defined_values
        if len(wbid) == 1:
            wbid = wbid[0].identifier.toPython()

            # get the author
            try:
                root = self.conf.get('wormbase_api_root_url', 'http://rest.wormbase.org')
                url = root + '/rest/widget/paper/' + str(wbid) + '/overview?content-type=application%2Fjson'
                j = _json_request(url)
                if 'fields' in j:
                    f = j['fields']
                    if 'authors' in f:
                        dat = f['authors']['data']
                        if dat is not None:
                            # BUG FIX: has_defined_value is a method; the
                            # original referenced it without calling, which is
                            # always truthy, so values were cleared whenever
                            # replace_existing was set
                            if replace_existing and self.author.has_defined_value():
                                self.author.clear()
                            for x in dat:
                                self.author.set(x['label'])

                    for fname in ('pmid', 'year', 'title', 'doi'):
                        if fname in f and f[fname]['data'] is not None:
                            attr = getattr(self, fname)
                            # BUG FIX: call has_defined_value() (see above)
                            if replace_existing and attr.has_defined_value():
                                attr.clear()
                            attr.set(f[fname]['data'])
            except Exception:
                # Best-effort retrieval: network/parse failures are logged,
                # not raised
                logger.warning("Couldn't retrieve Wormbase data", exc_info=True)
        elif len(wbid) == 0:
            raise WormbaseRetrievalException("There is no Wormbase ID attached to this Document."
                                             " So no data can be retrieved")
        else:
            raise WormbaseRetrievalException("There is more than one Wormbase ID attached to this Document."
                                             " Please try with just one Wormbase ID")

    def _crossref_doi_extract(self):
        """ Fill in author/title/year from Crossref using this document's DOI """
        def crRequest(doi):
            data = {'q': doi}
            data_encoded = urlencode(data)
            return _json_request(
                'http://search.labs.crossref.org/dois?%s' % data_encoded)

        doi = self.doi()
        if doi is None:
            # ROBUSTNESS FIX: without a DOI there is nothing to query;
            # previously this fell through to a TypeError on the slice below
            return
        if doi[:4] == 'http':
            doi = _doi_uri_to_doi(doi)
        try:
            r = crRequest(doi)
        except Exception:
            logger.warning("Couldn't retrieve Crossref info", exc_info=True)
            return
        # XXX: I don't think coins is meant to be used, but it has structured
        # data...
        # no error for bad ids, just an empty list
        if len(r) > 0:
            extra_data = r[0]['coins'].split('&')
            fields = (x.split("=") for x in extra_data)
            fields = [[y.replace('+', ' ').strip() for y in x] for x in fields]
            authors = [x[1] for x in fields if x[0] == 'rft.au']
            for a in authors:
                self.author(a)
            # Crossref can process multiple doi's at one go and return the
            # metadata. we just need the first one
            r = r[0]
            if 'title' in r:
                self.title(r['title'])
            if 'year' in r:
                self.year(r['year'])

    def update_from_pubmed(self):
        """
        Fill in author/title/DOI/year by querying PubMed's esummary endpoint
        with this document's PubMed ID.

        Raises
        ------
        PubmedRetrievalException
            When zero or more than one PubMed ID is attached to this document
        """
        def pmRequest(pmid):
            import xml.etree.ElementTree as ET  # Python 2.5 and up

            url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=' + str(pmid)
            # CONSISTENCY FIX: configuration is read through self.conf, as in
            # update_from_wormbase -- the object itself has no `get`
            key = self.conf.get('pubmed.api_key', None)
            if key:
                url += '&api_key=' + key
            else:
                logger.warning("PubMed API key not defined. API calls will be limited.")
            s = _url_request(url)
            if hasattr(s, 'charset'):
                # _url_request attaches `charset` when the Content-Type
                # header declares one
                parser = ET.XMLParser(encoding=s.charset)
            else:
                parser = None
            return ET.parse(s, parser)

        pmid = self.pmid.defined_values
        if len(pmid) == 1:
            pmid = pmid[0].identifier.toPython()
            try:
                tree = pmRequest(pmid)
            except Exception:
                logger.warning("Couldn't retrieve Pubmed info", exc_info=True)
                return
            for x in tree.findall('./DocSum/Item[@Name="AuthorList"]/Item'):
                self.author(x.text)
            for x in tree.findall('./DocSum/Item[@Name="Title"]'):
                self.title(x.text)
            for x in tree.findall('./DocSum/Item[@Name="DOI"]'):
                self.doi(x.text)
            for x in tree.findall('./DocSum/Item[@Name="PubDate"]'):
                self.year(x.text)
        elif len(pmid) == 0:
            raise PubmedRetrievalException('No Pubmed ID is attached to this document. Cannot retrieve Pubmed data')
        else:
            raise PubmedRetrievalException('More than one Pubmed ID is attached to this document.'
                                           ' Please try with just one Pubmed ID')
def _wormbase_uri_to_wbid(uri): return str(urlparse(uri).path.split("/")[2]) def _pubmed_uri_to_pmid(uri): return str(urlparse(uri).path.split("/")[2]) def _doi_uri_to_doi(uri): # DOI URL to DOI translation is complicated. This is a cop-out. parsed = urlparse(uri) if 'doi.org' in parsed.netloc: doi = parsed.path.split("/", 1)[1] else: doi = None return doi class EmptyRes(object): def read(self): return bytes() def _url_request(url, headers={}): try: r = Request(url, headers=headers) s = urlopen(r, timeout=1) info = dict(s.info()) content_type = {k.lower(): info[k] for k in info}['content-type'] md = re.search("charset *= *([^ ]+)", content_type) if md: s.charset = md.group(1) return s except HTTPError: logger.error("Error in request for {}".format(url), exc_info=True) return EmptyRes() except URLError: logger.error("Error in request for {}".format(url), exc_info=True) return EmptyRes() def _json_request(url): import json headers = {'Accept': 'application/json'} try: data = _url_request(url, headers).read().decode('UTF-8') if hasattr(data, 'charset'): return json.loads(data, encoding=data.charset) else: return json.loads(data) except BaseException: logger.warning("Couldn't retrieve JSON data from " + url, exc_info=True) return {}