from six.moves.urllib.parse import urlparse, urlencode
from six.moves.urllib.request import Request, urlopen
from six.moves.urllib.error import HTTPError, URLError
import re
import logging
from owmeta_core.graph_object import IdentifierMissingException
from owmeta_core.context import Context
from owmeta_core.dataobject import DataObject, DatatypeProperty, Alias
from . import SCI_CTX
from . import bibtex as BIB
logger = logging.getLogger(__name__)
class WormbaseRetrievalException(Exception):
    """Raised when data for a document cannot be looked up on WormBase."""
class PubmedRetrievalException(Exception):
    """Raised when data for a document cannot be looked up on PubMed."""
# A little bit about why this is a separate type from Document:
#
# This type corresponds to a document which has some statements that we care
# about. The key reason this is distinct from Document is that a document need
# not provide evidence of anything. For example, the `WormData.n4` file
# generated by insert_worm.py is a document, but it doesn't provide any
# scientific or logical justification for any of the statements made within it.
class BaseDocument(DataObject):
    """
    Base class for document-like data objects.

    Adds the ability to view the document as a `Context` whose identifier is
    derived from the document's own identifier.
    """

    class_context = SCI_CTX

    def make_context_identifier(self):
        """ Return the context identifier built from this object's identifier. """
        return self.make_identifier(self.identifier)

    @property
    def as_context(self):
        """
        A `Context` identified by `make_context_identifier`.

        When this object has a context of its own, the `Context` class is
        contextualized with it before being instantiated.
        """
        if self.context is not None:
            contextualized = Context.contextualize(self.context)
            return contextualized(ident=self.make_context_identifier())
        return Context(ident=self.make_context_identifier())
class Document(BaseDocument):
    """
    A representation of some document.

    Possible keys include::

        pmid, pubmed: a pubmed id or url (e.g., 24098140)
        wbid, wormbase: a wormbase id or url (e.g., WBPaper00044287)
        doi: a Digital Object id or url (e.g., s00454-010-9273-0)
        uri: a URI specific to the document, preferably usable for accessing
             the document
    """

    class_context = SCI_CTX

    author = DatatypeProperty(multiple=True)
    ''' An author of the document '''

    doi = DatatypeProperty()
    ''' A Digital Object Identifier (DOI), optional '''

    uri = DatatypeProperty(multiple=True)
    ''' A non-standard URI for the document '''

    wbid = DatatypeProperty()
    ''' An ID from WormBase.org that points to a record, optional '''

    wormbaseid = Alias(wbid)
    ''' An alias to `wbid` '''

    pmid = DatatypeProperty()
    ''' A PubMed ID (PMID) that points to a paper '''

    year = DatatypeProperty()
    ''' The year (e.g., publication year) of the document '''

    date = Alias(year)
    ''' Alias to year '''

    title = DatatypeProperty()
    ''' The title of the document '''

    def __init__(
            self,
            bibtex=None,
            doi=None,
            pubmed=None,
            wormbase=None,
            **kwargs):
        """
        Parameters
        ----------
        bibtex : string
            A string containing a single BibTeX entry. Parsed during initialization, but not saved thereafter. optional
        doi : string
            A Digital Object Identifier (DOI). optional
        pubmed : string
            A PubMed ID (PMID) or URL that points to a paper. Ignored if 'pmid' is provided. optional
        wormbase : string
            An ID or URL from WormBase that points to a record. Ignored if `wbid` or `wormbaseid` are provided. optional

        Raises
        ------
        ValueError
            If `pubmed` or `wormbase` is a URL from which no ID can be
            extracted, or if `bibtex` does not hold exactly one entry.
        """
        super(Document, self).__init__(**kwargs)
        # Order in which the ID properties are consulted when deriving the
        # identifier for this document.
        self.id_precedence = ('doi', 'pmid', 'wbid', 'uri')

        if bibtex is not None:
            self.update_with_bibtex(bibtex)

        # An explicitly provided `pmid` takes priority over `pubmed`.
        if pubmed is not None and not self.pmid.has_defined_value():
            if pubmed[:4] == 'http':
                pmid = _pubmed_uri_to_pmid(pubmed)
                if pmid is None:
                    raise ValueError("Couldn't convert Pubmed URL to a PubMed ID")
            else:
                pmid = pubmed
            self.pmid.set(pmid)

        # An explicitly provided `wbid` takes priority over `wormbase`.
        if wormbase is not None and not self.wbid.has_defined_value():
            if wormbase[:4] == 'http':
                wbid = _wormbase_uri_to_wbid(wormbase)
                if wbid is None:
                    raise ValueError("Couldn't convert Wormbase URL to a Wormbase ID")
            else:
                wbid = wormbase
            self.wbid.set(wbid)

        if doi is not None:
            if doi[:4] == 'http':
                # Unlike the IDs above, a DOI URL that cannot be reduced to a
                # bare DOI is stored unchanged rather than rejected.
                converted = _doi_uri_to_doi(doi)
                if converted is not None:
                    doi = converted
            self.doi.set(doi)

    def update_with_bibtex(self, bibtex):
        """
        Fill in this document from a string holding a single BibTeX entry.

        Raises
        ------
        ValueError
            If the string does not contain exactly one entry.
        """
        bib_db = BIB.loads(bibtex)
        entry_count = len(bib_db.entries)
        if entry_count != 1:
            raise ValueError('The given BibTex string has %d entries.'
                             ' Cannot determine which entry to use for the document' % entry_count)
        BIB.update_document_with_bibtex(self, bib_db.entries[0])

    def defined_augment(self):
        """ Return True if any property named in `id_precedence` has a defined value. """
        return any(getattr(self, id_kind).has_defined_value()
                   for id_kind in self.id_precedence)

    def identifier_augment(self):
        """
        Build the identifier from the first property in `id_precedence` that
        has a defined value.

        Raises
        ------
        IdentifierMissingException
            If none of those properties has a defined value.
        """
        for id_kind in self.id_precedence:
            id_prop = getattr(self, id_kind)
            if id_prop.has_defined_value():
                key = str(id_kind) + ":" + id_prop.defined_values[0].identifier.n3()
                return self.make_identifier(key)
        raise IdentifierMissingException(self)

    # TODO: Provide a way to override modification of already set values.
    def update_from_wormbase(self, replace_existing=False):
        """ Queries wormbase for additional data to fill in the Document.

        If replace_existing is set to `True`, then existing values will be cleared.

        Failures while requesting or reading the data are logged as warnings
        and otherwise ignored.

        Raises
        ------
        WormbaseRetrievalException
            If the document does not have exactly one Wormbase ID.
        """
        # XXX: wormbase's REST API is pretty sparse in terms of data provided.
        #      Would be better off using AQL or the perl interface
        # _Very_ few of these have these fields filled in
        wbids = self.wbid.defined_values
        if len(wbids) == 0:
            raise WormbaseRetrievalException("There is no Wormbase ID attached to this Document."
                                             " So no data can be retrieved")
        if len(wbids) > 1:
            raise WormbaseRetrievalException("There is more than one Wormbase ID attached to this Document."
                                             " Please try with just one Wormbase ID")

        wbid = wbids[0].identifier.toPython()
        try:
            root = self.conf.get('wormbase_api_root_url', 'http://rest.wormbase.org')
            url = root + '/rest/widget/paper/' + str(wbid) + '/overview?content-type=application%2Fjson'
            j = _json_request(url)
            if 'fields' not in j:
                return
            f = j['fields']

            # get the author
            if 'authors' in f:
                dat = f['authors']['data']
                if dat is not None:
                    if replace_existing and self.author.has_defined_value():
                        self.author.clear()
                    for x in dat:
                        self.author.set(x['label'])

            for fname in ('pmid', 'year', 'title', 'doi'):
                if fname in f and f[fname]['data'] is not None:
                    attr = getattr(self, fname)
                    if replace_existing and attr.has_defined_value():
                        attr.clear()
                    attr.set(f[fname]['data'])
        except Exception:
            logger.warning("Couldn't retrieve Wormbase data", exc_info=True)

    def _crossref_doi_extract(self):
        """
        Look this document's DOI up on Crossref and record the authors, title
        and year found in the first result.

        Does nothing when no DOI is set or when the lookup yields no results.
        """
        # Extract data from crossref
        def crRequest(doi):
            data_encoded = urlencode({'q': doi})
            return _json_request(
                'http://search.labs.crossref.org/dois?%s' %
                data_encoded)

        doi = self.doi()
        if not doi:
            return
        if doi[:4] == 'http':
            # Fall back to querying with the URL itself when it cannot be
            # reduced to a bare DOI, rather than querying for "None".
            converted = _doi_uri_to_doi(doi)
            if converted is not None:
                doi = converted

        try:
            r = crRequest(doi)
        except Exception:
            logger.warning("Couldn't retrieve Crossref info", exc_info=True)
            return

        # no error for bad ids, just an empty list
        if len(r) == 0:
            return

        # Crossref can process multiple doi's at one go and return the
        # metadata. we just need the first one
        first = r[0]

        # XXX: I don't think coins is meant to be used, but it has structured
        # data...
        extra_data = first.get('coins', '').split('&')
        fields = (x.split("=") for x in extra_data)
        fields = [[y.replace('+', ' ').strip() for y in x] for x in fields]
        # Skip keys that came without a value.
        authors = [x[1] for x in fields if len(x) > 1 and x[0] == 'rft.au']
        for a in authors:
            self.author(a)

        if 'title' in first:
            self.title(first['title'])
        if 'year' in first:
            self.year(first['year'])

    def update_from_pubmed(self):
        """
        Query PubMed's esummary service and record the authors, title, DOI
        and publication date it returns.

        Failures while requesting or parsing the data are logged as warnings
        and otherwise ignored.

        Raises
        ------
        PubmedRetrievalException
            If the document does not have exactly one PubMed ID.
        """
        def pmRequest(pmid):
            import xml.etree.ElementTree as ET  # Python 2.5 and up
            url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=' + str(pmid)
            # NOTE(review): update_from_wormbase reads configuration through
            # `self.conf.get` -- confirm `self.get` is the intended accessor.
            key = self.get('pubmed.api_key', None)
            if key:
                url += '&api_key=' + key
            else:
                logger.warning("PubMed API key not defined. API calls will be limited.")
            s = _url_request(url)
            # Parse with the charset the response declared, if any.
            if hasattr(s, 'charset'):
                parser = ET.XMLParser(encoding=s.charset)
            else:
                parser = None
            return ET.parse(s, parser)

        pmids = self.pmid.defined_values
        if len(pmids) == 0:
            raise PubmedRetrievalException('No Pubmed ID is attached to this document. Cannot retrieve Pubmed data')
        if len(pmids) > 1:
            raise PubmedRetrievalException('More than one Pubmed ID is attached to this document.'
                                           ' Please try with just one Pubmed ID')

        pmid = pmids[0].identifier.toPython()
        try:
            tree = pmRequest(pmid)
        except Exception:
            logger.warning("Couldn't retrieve Pubmed info", exc_info=True)
            return

        for x in tree.findall('./DocSum/Item[@Name="AuthorList"]/Item'):
            self.author(x.text)

        for x in tree.findall('./DocSum/Item[@Name="Title"]'):
            self.title(x.text)

        for x in tree.findall('./DocSum/Item[@Name="DOI"]'):
            self.doi(x.text)

        for x in tree.findall('./DocSum/Item[@Name="PubDate"]'):
            self.year(x.text)
def _wormbase_uri_to_wbid(uri):
return str(urlparse(uri).path.split("/")[2])
def _pubmed_uri_to_pmid(uri):
return str(urlparse(uri).path.split("/")[2])
def _doi_uri_to_doi(uri):
# DOI URL to DOI translation is complicated. This is a cop-out.
parsed = urlparse(uri)
if 'doi.org' in parsed.netloc:
doi = parsed.path.split("/", 1)[1]
else:
doi = None
return doi
class EmptyRes(object):
    """ Stand-in for a response object whose `read` yields no data. """

    def read(self):
        """ Return an empty byte string. """
        return b''
def _url_request(url, headers={}):
try:
r = Request(url, headers=headers)
s = urlopen(r, timeout=1)
info = dict(s.info())
content_type = {k.lower(): info[k] for k in info}['content-type']
md = re.search("charset *= *([^ ]+)", content_type)
if md:
s.charset = md.group(1)
return s
except HTTPError:
logger.error("Error in request for {}".format(url), exc_info=True)
return EmptyRes()
except URLError:
logger.error("Error in request for {}".format(url), exc_info=True)
return EmptyRes()
def _json_request(url):
    """
    Request `url` asking for JSON and return the decoded document.

    The body is decoded with the charset recorded on the response, falling
    back to UTF-8 when none is recorded or the recorded one is unknown.

    Returns
    -------
    The parsed JSON value, or an empty dict if the request or the parsing
    failed (the failure is logged as a warning).
    """
    import json
    headers = {'Accept': 'application/json'}
    try:
        resp = _url_request(url, headers)
        try:
            raw = resp.read()
        finally:
            # Not every response object has `close`, so look it up first.
            close = getattr(resp, 'close', None)
            if close is not None:
                close()

        charset = getattr(resp, 'charset', None) or 'UTF-8'
        try:
            text = raw.decode(charset)
        except LookupError:
            # Unknown charset name in the response header
            text = raw.decode('UTF-8')
        return json.loads(text)
    except Exception:
        logger.warning("Couldn't retrieve JSON data from " + url,
                       exc_info=True)
        return {}