Source code for owmeta.document

from six.moves.urllib.parse import urlparse, urlencode
import re
import logging

from owmeta_core.graph_object import IdentifierMissingException
from owmeta_core.context import Context
import owmeta_core.dataobject_property as DP
from owmeta_core.dataobject import DataObject, DatatypeProperty, Alias, BaseDataObject
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from . import SCI_CTX
from . import bibtex as BIB


# Module-level logger, named after this module via ``__name__``
logger = logging.getLogger(__name__)


class WormbaseRetrievalException(Exception):
    """Raised when a WormBase lookup cannot be performed for a document."""


class PubmedRetrievalException(Exception):
    """Raised when a PubMed lookup cannot be performed for a document."""


# A little bit about why this is a separate type from Document:
#
# This type corresponds to a document which has some statements that we care
# about. The key reason this is distinct from Document is that a document need
# not provide evidence of anything. For example, the `WormData.n4` file
# generated by insert_worm.py is a document, but it doesn't provide any
# scientific or logical justification for any of the statements made within it.
class BaseDocument(DataObject):
    """
    Base type for document objects that can be viewed as a `Context`.
    """
    class_context = SCI_CTX

    def make_context_identifier(self):
        """
        Return the identifier made by passing this object's own identifier to
        `make_identifier`. It is used as the identifier of `as_context`.
        """
        return self.make_identifier(self.identifier)

    @property
    def as_context(self):
        """
        A `Context` identified by `make_context_identifier`.

        When this object has a context, the `Context` class is contextualized
        with it before being instantiated; otherwise a plain `Context` is
        created.
        """
        own_context = self.context
        if own_context is None:
            context_factory = Context
        else:
            context_factory = Context.contextualize(own_context)
        return context_factory(ident=self.make_context_identifier())


class Document(BaseDocument):
    """
    A representation of some document.

    Possible keys include::

        pmid, pubmed: a pubmed id or url (e.g., 24098140)
        wbid, wormbase: a wormbase id or url (e.g., WBPaper00044287)
        doi: a Digital Object id or url (e.g., s00454-010-9273-0)
        uri: a URI specific to the document, preferably usable for accessing
             the document
    """

    class_context = SCI_CTX

    author = DatatypeProperty(multiple=True)
    ''' An author of the document '''

    doi = DatatypeProperty()
    ''' A Digital Object Identifier (DOI), optional '''

    uri = DatatypeProperty(multiple=True)
    ''' A non-standard URI for the document '''

    wbid = DatatypeProperty()
    ''' An ID from WormBase.org that points to a record, optional '''

    wormbaseid = Alias(wbid)
    ''' An alias to `wbid` '''

    pmid = DatatypeProperty()
    ''' A PubMed ID (PMID) that points to a paper '''

    year = DatatypeProperty()
    ''' The year (e.g., publication year) of the document '''

    date = Alias(year)
    ''' Alias to year '''

    title = DatatypeProperty()
    ''' The title of the document '''

    def __init__(
            self,
            bibtex=None,
            doi=None,
            pubmed=None,
            wormbase=None,
            **kwargs):
        """
        Parameters
        ----------
        bibtex : string
            A string containing a single BibTeX entry. Parsed during initialization, but not saved thereafter. optional
        doi : string
            A Digital Object Identifier (DOI). optional
        pubmed : string
            A PubMed ID (PMID) or URL that points to a paper. Ignored if 'pmid' is provided. optional
        wormbase : string
            An ID or URL from WormBase that points to a record. Ignored if `wbid` or `wormbaseid` are provided. optional

        Raises
        ------
        ValueError
            If `pubmed` or `wormbase` is a URL from which no ID could be
            extracted, or if `bibtex` does not contain exactly one entry
        """
        super(Document, self).__init__(**kwargs)

        # Properties are consulted in this order when deciding whether the
        # document is defined and when building its identifier
        self.id_precedence = ('doi', 'pmid', 'wbid', 'uri')

        if bibtex is not None:
            self.update_with_bibtex(bibtex)

        # An explicitly provided `pmid` takes priority over `pubmed`
        if pubmed is not None and not self.pmid.has_defined_value():
            if pubmed[:4] == 'http':
                pmid = _pubmed_uri_to_pmid(pubmed)
                if pmid is None:
                    raise ValueError("Couldn't convert Pubmed URL to a PubMed ID")
            else:
                pmid = pubmed
            self.pmid.set(pmid)

        # An explicitly provided `wbid` takes priority over `wormbase`
        if wormbase is not None and not self.wbid.has_defined_value():
            if wormbase[:4] == 'http':
                wbid = _wormbase_uri_to_wbid(wormbase)
                if wbid is None:
                    raise ValueError("Couldn't convert Wormbase URL to a Wormbase ID")
            else:
                wbid = wormbase
            self.wbid.set(wbid)

        if doi is not None:
            if doi[:4] == 'http':
                # When the URL can't be reduced to a bare DOI, the URL itself
                # is stored
                converted = _doi_uri_to_doi(doi)
                if converted is not None:
                    doi = converted
            self.doi.set(doi)

    def update_with_bibtex(self, bibtex):
        """
        Update this document from a BibTeX string holding a single entry.

        Parameters
        ----------
        bibtex : string
            A string containing exactly one BibTeX entry

        Raises
        ------
        ValueError
            If the string does not contain exactly one entry
        """
        entries = BIB.loads(bibtex).entries
        if len(entries) > 1:
            raise ValueError('The given BibTex string has %d entries.'
                             ' Cannot determine which entry to use for the document' % len(entries))
        if not entries:
            raise ValueError('The given BibTex string has no entries.')
        BIB.update_document_with_bibtex(self, entries[0])

    def defined_augment(self):
        """
        Return `True` if any property named in `id_precedence` has a defined
        value.
        """
        return any(getattr(self, kind).has_defined_value()
                   for kind in self.id_precedence)

    def identifier_augment(self):
        """
        Build an identifier from the first property in `id_precedence` that
        has a defined value.

        Raises
        ------
        IdentifierMissingException
            If none of the properties in `id_precedence` has a defined value
        """
        for idKind in self.id_precedence:
            idprop = getattr(self, idKind)
            if idprop.has_defined_value():
                # The property name is included so that equal values of
                # different ID kinds produce different identifiers
                s = str(idKind) + ":" + idprop.defined_values[0].identifier.n3()
                return self.make_identifier(s)
        raise IdentifierMissingException(self)

    def update_from_wormbase(self, replace_existing=False, **kwargs):
        """ Queries WormBase.org for additional data to fill in the `Document`.

        If replace_existing is set to `True`, then existing values will be cleared.

        Failures while requesting or interpreting the WormBase record are
        logged as warnings rather than raised.

        Parameters
        ----------
        replace_existing : bool
            Whether to replace values that are already set for a given property
        **kwargs
            Passed on as arguments to `requests.Session.get`

        Raises
        ------
        WormbaseRetrievalException
            If the document does not have exactly one defined Wormbase ID
        """

        # XXX: wormbase's REST API is pretty sparse in terms of data provided.
        #     Would be better off using AQL or the perl interface
        # _Very_ few of these have these fields filled in
        wbids = self.wbid.defined_values
        if len(wbids) == 0:
            raise WormbaseRetrievalException("There is no Wormbase ID attached to this Document."
                                             " So no data can be retrieved")
        if len(wbids) > 1:
            raise WormbaseRetrievalException("There is more than one Wormbase ID attached to this Document."
                                             " Please try with just one Wormbase ID")

        wbid = wbids[0].identifier.toPython()
        try:
            root = self.conf.get('wormbase_api_root_url', 'http://rest.wormbase.org')
            url = f'{root}/rest/widget/paper/{wbid}/overview?content-type=application%2Fjson'
            j = _json_request(url, **kwargs)
            if 'fields' not in j:
                return
            f = j['fields']

            # get the author
            if 'authors' in f:
                dat = f['authors']['data']
                if dat is not None:
                    # `has_defined_value` is a method and has to be called;
                    # the bare attribute is always truthy
                    if replace_existing and self.author.has_defined_value():
                        self.author.clear()
                    for x in dat:
                        self.author.set(x['label'])

            for fname in ('pmid', 'year', 'title', 'doi'):
                if fname in f and f[fname]['data'] is not None:
                    attr = getattr(self, fname)
                    if replace_existing and attr.has_defined_value():
                        attr.clear()
                    attr.set(f[fname]['data'])
        except Exception:
            logger.warning("Couldn't retrieve Wormbase data", exc_info=True)

    def _crossref_doi_extract(self):
        """
        Fill in authors, title and year from a Crossref search on this
        document's DOI. Does nothing when no DOI is set or when no usable
        result comes back.
        """
        doi = self.doi()
        if doi is None:
            logger.warning("No DOI is attached to this document. Cannot retrieve Crossref info")
            return
        if doi[:4] == 'http':
            # Fall back to the URL itself when it can't be reduced to a DOI,
            # as `__init__` does
            converted = _doi_uri_to_doi(doi)
            if converted is not None:
                doi = converted

        # Extract data from crossref
        try:
            r = _json_request(
                'http://search.labs.crossref.org/dois?%s' %
                urlencode({'q': doi}))
        except Exception:
            logger.warning("Couldn't retrieve Crossref info", exc_info=True)
            return

        # no error for bad ids, just an empty list. A failed request yields an
        # empty dict instead, so anything that isn't a non-empty list is
        # ignored
        if not isinstance(r, list) or len(r) == 0:
            return

        # Crossref can process multiple doi's at one go and return the
        # metadata. we just need the first one
        first = r[0]

        # XXX: I don't think coins is meant to be used, but it has structured
        # data...
        if 'coins' in first:
            extra_data = first['coins'].split('&amp;')
            fields = (x.split("=") for x in extra_data)
            fields = [[y.replace('+', ' ').strip() for y in x] for x in fields]
            authors = [x[1] for x in fields if len(x) > 1 and x[0] == 'rft.au']
            for a in authors:
                self.author(a)

        if 'title' in first:
            self.title(first['title'])
        if 'year' in first:
            self.year(first['year'])

    def update_from_pubmed(self, read_size=2**16, **kwargs):
        '''
        Update the document attributes from NCBI Entrez API using the pubmed attribute

        A failure to retrieve or parse the PubMed record is logged as a
        warning rather than raised.

        Parameters
        ----------
        read_size : int
            The number of bytes to pass to `requests.Response.iter_content`. This *may*
            reduce runtime memory requirements for the request.
        **kwargs
            Passed on as arguments to `requests.Session.get`

        Raises
        ------
        PubmedRetrievalException
            If the document does not have exactly one defined Pubmed ID
        '''

        def pmRequest(pmid):
            # Fetch the esummary record for `pmid` and return the parsed XML
            # root element
            import xml.etree.ElementTree as ET  # Python 2.5 and up

            url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'
                    f'db=pubmed&id={pmid}')
            key = self.get('pubmed.api_key', None)
            if key:
                url += f'&api_key={key}'
            else:
                logger.warning("PubMed API key not defined. API calls will be limited.")

            request_args = dict(kwargs)
            # Retries are on unless the caller explicitly said otherwise
            request_args.setdefault('do_retries', True)
            # The body is always streamed so it can be fed to the parser in
            # chunks
            request_args['stream'] = True

            s = _url_request(url, **request_args)
            with s:
                if hasattr(s, 'charset'):
                    parser = ET.XMLParser(encoding=s.charset)
                else:
                    parser = ET.XMLParser(encoding='UTF-8')
                for chunk in s.iter_content(read_size):
                    parser.feed(chunk)
                return parser.close()

        pmids = self.pmid.defined_values
        if len(pmids) == 0:
            raise PubmedRetrievalException('No Pubmed ID is attached to this document. Cannot retrieve Pubmed data')
        if len(pmids) > 1:
            raise PubmedRetrievalException('More than one Pubmed ID is attached to this document.'
                                           ' Please try with just one Pubmed ID')

        pmid = pmids[0].identifier.toPython()
        try:
            tree = pmRequest(pmid)
        except Exception:
            logger.warning("Couldn't retrieve Pubmed info", exc_info=True)
            return

        for x in tree.findall('./DocSum/Item[@Name="AuthorList"]/Item'):
            self.author(x.text)

        for x in tree.findall('./DocSum/Item[@Name="Title"]'):
            self.title(x.text)

        for x in tree.findall('./DocSum/Item[@Name="DOI"]'):
            self.doi(x.text)

        for x in tree.findall('./DocSum/Item[@Name="PubDate"]'):
            self.year(x.text)


class SourcedFrom(DP.ObjectProperty):
    """
    Indicates which document provided the source for an object
    """
    # The two ends of the property: owner type and value type
    owner_type = BaseDataObject
    value_type = BaseDocument

    link_name = 'sourced_from'
    class_context = SCI_CTX

    # Declared as single-valued and lazy
    multiple = False
    lazy = True


def _wormbase_uri_to_wbid(uri):
    """
    Extract a Wormbase ID from a Wormbase URL.

    The ID is taken from the second segment of the URL path (e.g.,
    ``WBPaper00044287`` in ``http://host/db/WBPaper00044287``).

    Returns
    -------
    str or None
        The ID, or `None` if the URL path has no such segment
    """
    parts = urlparse(uri).path.split("/")
    # A path like "/db/X" splits into ['', 'db', 'X']. Shorter paths or an
    # empty segment mean there is no ID, which is reported as None rather
    # than as an IndexError or an empty string
    if len(parts) > 2 and parts[2]:
        return str(parts[2])
    return None


def _pubmed_uri_to_pmid(uri):
    """
    Extract a PubMed ID from a PubMed URL.

    The ID is taken from the second segment of the URL path (e.g.,
    ``http://www.ncbi.nlm.nih.gov/pubmed/24098140``). If that segment is
    absent or empty, an all-digit first segment is accepted instead (e.g.,
    ``https://pubmed.ncbi.nlm.nih.gov/24098140/``).

    Returns
    -------
    str or None
        The ID, or `None` if none could be found in the URL path
    """
    parts = urlparse(uri).path.split("/")
    # A path like "/pubmed/123" splits into ['', 'pubmed', '123']
    if len(parts) > 2 and parts[2]:
        return str(parts[2])
    # Fallback for "/123" and "/123/", where the ID is the first segment
    if len(parts) > 1 and parts[1].isdigit():
        return str(parts[1])
    return None


def _doi_uri_to_doi(uri):
    """
    Extract a DOI from a ``doi.org`` URL.

    Returns
    -------
    str or None
        Everything after the first ``/`` of the URL path, or `None` if the
        host does not contain ``doi.org`` or the path holds no DOI
    """
    # DOI URL to DOI translation is complicated. This is a cop-out.
    parsed = urlparse(uri)
    if 'doi.org' not in parsed.netloc:
        return None

    # `partition` copes with an empty path (e.g., "http://doi.org"), where
    # indexing the result of `split` would raise an IndexError
    doi = parsed.path.partition("/")[2]
    return doi or None


def _url_request(url, requests_session=None, do_retries=False, **kwargs):
    """
    Perform a GET request and return the response.

    Parameters
    ----------
    url : str
        The URL to request
    requests_session : requests.Session, optional
        The session to use. A new session is created if none is given
    do_retries : bool, optional
        If `True`, an adapter with a default `Retry` configuration is mounted
        on the session (including one passed in by the caller) for HTTP and
        HTTPS URLs
    **kwargs
        Passed on to the session's `get` method. A `timeout` of 1 is used if
        none is given

    Returns
    -------
    The response. If the ``content-type`` header declares a charset, it is
    stored on the response as a `charset` attribute.

    Raises
    ------
    Exception
        If the status code is not 200. Errors raised by the request itself
        are logged and re-raised
    """
    sess = requests.Session() if requests_session is None else requests_session

    if do_retries:
        adapter = HTTPAdapter(max_retries=Retry())
        sess.mount('http://', adapter)
        sess.mount('https://', adapter)

    kwargs.setdefault('timeout', 1)

    try:
        resp = sess.get(url, **kwargs)
        if resp.status_code != 200:
            # Release the connection; the caller never sees this response
            resp.close()
            raise Exception(f'Service returned status code {resp.status_code}')
        content_type = resp.headers.get('content-type')
        if content_type:
            # The charset value ends at whitespace, a ';' separating further
            # parameters, or a closing quote, none of which belong to it
            md = re.search(r'''charset\s*=\s*["']?([^\s;"']+)''', content_type, re.IGNORECASE)
            if md:
                resp.charset = md.group(1)

        return resp
    except Exception:
        logger.error("Error in request for %s", url, exc_info=True)
        raise


def _json_request(url, **kwargs):
    """
    Request `url` asking for JSON and return the decoded response body.

    Parameters
    ----------
    url : str
        The URL to request
    **kwargs
        Passed on to `_url_request`. Any `headers` given are copied and the
        ``Accept`` header is set to ``application/json``

    Returns
    -------
    The decoded JSON, or an empty dict if the request or the decoding fails
    (the failure is logged as a warning)
    """
    # Work on a copy so the caller's headers mapping isn't modified
    headers = dict(kwargs.get('headers') or {})
    headers['Accept'] = 'application/json'
    kwargs['headers'] = headers
    try:
        return _url_request(url, **kwargs).json()
    except Exception:
        # Deliberately not BaseException: KeyboardInterrupt and SystemExit
        # must not be turned into an empty result
        logger.warning("Couldn't retrieve JSON data from %s", url,
                       exc_info=True)
        return {}