Source code for owmeta.document

from six.moves.urllib.parse import urlparse, urlencode
import re
import logging

from owmeta_core.graph_object import IdentifierMissingException
from owmeta_core.context import Context
import owmeta_core.dataobject_property as DP
from owmeta_core.dataobject import DataObject, DatatypeProperty, Alias, BaseDataObject
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from . import SCI_CTX
from . import bibtex as BIB


# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class WormbaseRetrievalException(Exception):
    """Raised when data cannot be retrieved from WormBase for a document."""
class PubmedRetrievalException(Exception):
    """Raised when data cannot be retrieved from PubMed for a document."""
# A little bit about why this is a separate type from Document:
#
# This type corresponds to a document which has some statements that we care
# about. The key reason this is distinct from Document is that a document need
# not provide evidence of anything. For example, the `WormData.n4` file
# generated by insert_worm.py is a document, but it doesn't provide any
# scientific or logical justification for any of the statements made within it.
class BaseDocument(DataObject):
    """
    Base type for documents; a document can be viewed as a `Context` through
    `as_context`.
    """

    class_context = SCI_CTX

    def make_context_identifier(self):
        """
        Build the identifier for the context corresponding to this document by
        passing this object's own identifier to `make_identifier`.
        """
        return self.make_identifier(self.identifier)

    @property
    def as_context(self):
        """
        A `Context` whose identifier comes from `make_context_identifier`.

        When this object has a context of its own, the returned context is
        contextualized with it; otherwise a plain `Context` is returned.
        """
        if self.context is None:
            return Context(ident=self.make_context_identifier())
        contextualized = Context.contextualize(self.context)
        return contextualized(ident=self.make_context_identifier())
class Document(BaseDocument):
    """
    A representation of some document.

    Possible keys include::

        pmid, pubmed: a pubmed id or url (e.g., 24098140)
        wbid, wormbase: a wormbase id or url (e.g., WBPaper00044287)
        doi: a Digital Object id or url (e.g., s00454-010-9273-0)
        uri: a URI specific to the document, preferably usable for accessing
             the document
    """

    class_context = SCI_CTX

    author = DatatypeProperty(multiple=True)
    ''' An author of the document '''

    doi = DatatypeProperty()
    ''' A Digital Object Identifier (DOI), optional '''

    uri = DatatypeProperty(multiple=True)
    ''' A non-standard URI for the document '''

    wbid = DatatypeProperty()
    ''' An ID from WormBase.org that points to a record, optional '''

    wormbaseid = Alias(wbid)
    ''' An alias to `wbid` '''

    pmid = DatatypeProperty()
    ''' A PubMed ID (PMID) that points to a paper '''

    year = DatatypeProperty()
    ''' The year (e.g., publication year) of the document '''

    date = Alias(year)
    ''' Alias to year '''

    title = DatatypeProperty()
    ''' The title of the document '''

    def __init__(
            self,
            bibtex=None,
            doi=None,
            pubmed=None,
            wormbase=None,
            **kwargs):
        """
        Parameters
        ----------
        bibtex : string
            A string containing a single BibTeX entry. Parsed during
            initialization, but not saved thereafter. optional
        doi : string
            A Digital Object Identifier (DOI) or a ``doi.org`` URL. optional
        pubmed : string
            A PubMed ID (PMID) or URL that points to a paper. Ignored if 'pmid'
            is provided. optional
        wormbase : string
            An ID or URL from WormBase that points to a record. Ignored if
            `wbid` or `wormbaseid` are provided. optional

        Raises
        ------
        ValueError
            If a PubMed or WormBase URL cannot be converted to an ID, or if
            `bibtex` does not contain exactly one entry
        """
        super(Document, self).__init__(**kwargs)
        # Order in which properties are consulted when building an identifier
        self.id_precedence = ('doi', 'pmid', 'wbid', 'uri')

        if bibtex is not None:
            self.update_with_bibtex(bibtex)

        # An explicitly provided `pmid` keyword wins over `pubmed`
        if pubmed is not None and not self.pmid.has_defined_value():
            if pubmed[:4] == 'http':
                _tmp = _pubmed_uri_to_pmid(pubmed)
                if _tmp is None:
                    raise ValueError("Couldn't convert Pubmed URL to a PubMed ID")
                pmid = _tmp
            else:
                pmid = pubmed
            self.pmid.set(pmid)

        # An explicitly provided `wbid`/`wormbaseid` keyword wins over `wormbase`
        if wormbase is not None and not self.wbid.has_defined_value():
            if wormbase[:4] == 'http':
                _tmp = _wormbase_uri_to_wbid(wormbase)
                if _tmp is None:
                    raise ValueError("Couldn't convert Wormbase URL to a Wormbase ID")
                wbid = _tmp
            else:
                wbid = wormbase
            self.wbid.set(wbid)

        if doi is not None:
            if doi[:4] == 'http':
                # If the URL is not recognized as a DOI URL, the URL itself is
                # stored as the DOI value
                _tmp = _doi_uri_to_doi(doi)
                if _tmp is not None:
                    doi = _tmp
            self.doi.set(doi)

    def update_with_bibtex(self, bibtex):
        """
        Update this document from a string containing a single BibTeX entry.

        Parameters
        ----------
        bibtex : string
            A string containing exactly one BibTeX entry

        Raises
        ------
        ValueError
            If the string contains no entries or more than one entry
        """
        bib_db = BIB.loads(bibtex)
        entries = bib_db.entries
        if len(entries) > 1:
            # Count the entries list: the original formatted `len(bib_db)`,
            # i.e. the length of the database object rather than of its entries
            raise ValueError('The given BibTex string has %d entries.'
                             ' Cannot determine which entry to use for the document' % len(entries))
        if not entries:
            # Previously surfaced as a bare IndexError from `entries[0]`
            raise ValueError('The given BibTex string has no entries.'
                             ' Cannot update the document')
        BIB.update_document_with_bibtex(self, entries[0])

    def defined_augment(self):
        """
        Return `True` if any of the properties named in `id_precedence` has a
        defined value, `False` otherwise.
        """
        return any(getattr(self, name).has_defined_value()
                   for name in self.id_precedence)

    def identifier_augment(self):
        """
        Make an identifier from the first property in `id_precedence` that has
        a defined value.

        Raises
        ------
        IdentifierMissingException
            If none of the properties in `id_precedence` has a defined value
        """
        for id_kind in self.id_precedence:
            idprop = getattr(self, id_kind)
            if idprop.has_defined_value():
                s = str(id_kind) + ":" + idprop.defined_values[0].identifier.n3()
                return self.make_identifier(s)
        raise IdentifierMissingException(self)

    def update_from_wormbase(self, replace_existing=False, **kwargs):
        """
        Queries WormBase.org for additional data to fill in the `Document`.

        If replace_existing is set to `True`, then existing values will be
        cleared.

        Retrieval failures are logged as warnings rather than raised.

        Parameters
        ----------
        replace_existing : bool
            Whether to replace values that are already set for a given property
        **kwargs
            Passed on as arguments to `requests.Session.get`

        Raises
        ------
        WormbaseRetrievalException
            If the document does not have exactly one WormBase ID
        """
        # XXX: wormbase's REST API is pretty sparse in terms of data provided.
        #      Would be better off using AQL or the perl interface
        # _Very_ few of these have these fields filled in
        wbids = self.wbid.defined_values
        if len(wbids) == 0:
            raise WormbaseRetrievalException("There is no Wormbase ID attached to this Document."
                                             " So no data can be retrieved")
        if len(wbids) > 1:
            raise WormbaseRetrievalException("There is more than one Wormbase ID attached to this Document."
                                             " Please try with just one Wormbase ID")

        wbid = wbids[0].identifier.toPython()
        try:
            root = self.conf.get('wormbase_api_root_url', 'http://rest.wormbase.org')
            url = f'{root}/rest/widget/paper/{wbid}/overview?content-type=application%2Fjson'
            j = _json_request(url, **kwargs)
            if 'fields' in j:
                f = j['fields']
                # get the author
                if 'authors' in f:
                    dat = f['authors']['data']
                    if dat is not None:
                        # `has_defined_value` must be *called*; the bare
                        # method object is always truthy
                        if replace_existing and self.author.has_defined_value():
                            self.author.clear()
                        for x in dat:
                            self.author.set(x['label'])

                for fname in ('pmid', 'year', 'title', 'doi'):
                    if fname in f and f[fname]['data'] is not None:
                        attr = getattr(self, fname)
                        if replace_existing and attr.has_defined_value():
                            attr.clear()
                        attr.set(f[fname]['data'])
        except Exception:
            # Best-effort update: log and carry on
            logger.warning("Couldn't retrieve Wormbase data", exc_info=True)

    def _crossref_doi_extract(self):
        """
        Fill in author, title and year from a Crossref search on this
        document's DOI. Retrieval failures are logged as warnings.
        """
        # Extract data from crossref
        def crRequest(doi):
            data = {'q': doi}
            data_encoded = urlencode(data)
            return _json_request(
                'http://search.labs.crossref.org/dois?%s' % data_encoded)

        doi = self.doi()
        if doi[:4] == 'http':
            doi = _doi_uri_to_doi(doi)

        try:
            r = crRequest(doi)
        except Exception:
            logger.warning("Couldn't retrieve Crossref info", exc_info=True)
            return

        # no error for bad ids, just an empty list
        if len(r) == 0:
            return

        # Crossref can process multiple doi's at one go and return the
        # metadata. we just need the first one
        first = r[0]

        # XXX: I don't think coins is meant to be used, but it has structured
        # data...
        extra_data = first['coins'].split('&')
        fields = (x.split("=") for x in extra_data)
        fields = [[y.replace('+', ' ').strip() for y in x] for x in fields]
        authors = [x[1] for x in fields if x[0] == 'rft.au']
        for a in authors:
            self.author(a)

        if 'title' in first:
            self.title(first['title'])
        if 'year' in first:
            self.year(first['year'])

    def update_from_pubmed(self, read_size=2**16, **kwargs):
        '''
        Update the document attributes from NCBI Entrez API using the pubmed
        attribute

        Retrieval failures are logged as warnings rather than raised.

        Parameters
        ----------
        read_size : int
            The number of bytes to pass to `requests.Response.iter_content`.
            This *may* reduce runtime memory requirements for the request.
        **kwargs
            Passed on as arguments to `requests.Session.get`

        Raises
        ------
        PubmedRetrievalException
            If the document does not have exactly one PubMed ID
        '''
        def pmRequest(pmid):
            import xml.etree.ElementTree as ET  # Python 2.5 and up
            url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?'
                   f'db=pubmed&id={pmid}')
            # NOTE(review): `update_from_wormbase` reads configuration through
            # `self.conf.get`; confirm `self.get` is the intended accessor here
            key = self.get('pubmed.api_key', None)
            if key:
                url += f'&api_key={key}'
            else:
                logger.warning("PubMed API key not defined. API calls will be limited.")

            # Retry by default, but let the caller opt out explicitly
            kwargs.setdefault('do_retries', True)
            # Always stream so the response can be fed to the parser in chunks
            kwargs['stream'] = True
            s = _url_request(url, **kwargs)
            if hasattr(s, 'charset'):
                parser = ET.XMLParser(encoding=s.charset)
            else:
                parser = ET.XMLParser(encoding='UTF-8')

            with s:
                for chunk in s.iter_content(read_size):
                    parser.feed(chunk)
            return parser.close()

        pmids = self.pmid.defined_values
        if len(pmids) == 0:
            raise PubmedRetrievalException('No Pubmed ID is attached to this document. Cannot retrieve Pubmed data')
        if len(pmids) > 1:
            raise PubmedRetrievalException('More than one Pubmed ID is attached to this document.'
                                           ' Please try with just one Pubmed ID')

        pmid = pmids[0].identifier.toPython()
        try:
            tree = pmRequest(pmid)
        except Exception:
            logger.warning("Couldn't retrieve Pubmed info", exc_info=True)
            return

        for x in tree.findall('./DocSum/Item[@Name="AuthorList"]/Item'):
            self.author(x.text)

        for x in tree.findall('./DocSum/Item[@Name="Title"]'):
            self.title(x.text)

        for x in tree.findall('./DocSum/Item[@Name="DOI"]'):
            self.doi(x.text)

        for x in tree.findall('./DocSum/Item[@Name="PubDate"]'):
            self.year(x.text)
class SourcedFrom(DP.ObjectProperty):
    """
    Indicates which document provided the source for an object
    """

    class_context = SCI_CTX

    # Name under which the property is attached to its owner
    link_name = "sourced_from"

    # Values are documents; owners may be any `BaseDataObject`
    value_type = BaseDocument
    owner_type = BaseDataObject

    multiple = False
    lazy = True
def _wormbase_uri_to_wbid(uri): return str(urlparse(uri).path.split("/")[2]) def _pubmed_uri_to_pmid(uri): return str(urlparse(uri).path.split("/")[2]) def _doi_uri_to_doi(uri): # DOI URL to DOI translation is complicated. This is a cop-out. parsed = urlparse(uri) if 'doi.org' in parsed.netloc: doi = parsed.path.split("/", 1)[1] else: doi = None return doi def _url_request(url, requests_session=None, do_retries=False, **kwargs): if requests_session is None: sess = requests.Session() else: sess = requests_session if do_retries: retries = Retry() adapter = HTTPAdapter(max_retries=retries) sess.mount('http://', adapter) sess.mount('https://', adapter) if 'timeout' not in kwargs: kwargs['timeout'] = 1 try: resp = sess.get(url, **kwargs) if resp.status_code != 200: raise Exception(f'Service returned status code {resp.status_code}') content_type = resp.headers.get('content-type') if content_type: md = re.search("charset *= *([^ ]+)", content_type) if md: resp.charset = md.group(1) return resp except Exception: logger.error("Error in request for %s", url, exc_info=True) raise def _json_request(url, **kwargs): if 'headers' in kwargs: headers = kwargs['headers'] else: headers = {} kwargs['headers'] = headers headers['Accept'] = 'application/json' try: return _url_request(url, **kwargs).json() except BaseException: logger.warning("Couldn't retrieve JSON data from %s", url, exc_info=True) return {}