class CommPara(OAIDC):
    """Transformer for OAI-PMH records whose metadata is ``comm_para`` paradata.

    Construction is delegated unchanged to ``OAIDC``; only ``format`` is
    specialised to read the ``comm_para:commParadata`` element.
    """

    def __init__(self, identity, config, namespaces):
        OAIDC.__init__(self, identity, config, namespaces)

    def format(self, record):
        """Build a document from a single OAI-PMH ``record`` element.

        Args:
            record: an element supporting ``xpath(expr, namespaces=...)``
                (the expressions below expect ``oai:header`` and
                ``oai:metadata`` children).

        Returns:
            A ``(repo_id, doc)`` tuple. ``(None, None)`` is returned when the
            record has no ``comm_para:usageDataResourceURL`` or when that
            value has no URL scheme or network location. ``repo_id`` is
            ``None`` when the header identifier cannot be read.
        """
        doc = self.get_doc_template()
        resource_locator = record.xpath(
            "oai:metadata/comm_para:commParadata/comm_para:usageDataResourceURL/text()",
            namespaces=self.namespaces)

        # Without a resource URL there is nothing to publish.
        if not resource_locator:
            return (None, None)

        try:
            (scheme, netloc, _, _, _, _) = urlparse(resource_locator[0])
        except Exception:
            # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            log.exception("Not a URL: %s", resource_locator[0])
            return (None, None)
        if scheme == '' or netloc == '':
            return (None, None)

        try:
            repo_id = record.xpath("oai:header/oai:identifier[1]/text()",
                                   namespaces=self.namespaces)[0]
        except Exception:
            # Best effort: a record without a readable identifier is still
            # formatted, just without a repository id.
            repo_id = None

        payload = record.xpath("oai:metadata/comm_para:commParadata",
                               namespaces=self.namespaces)
        collection = record.xpath("oai:header/oai:setSpec[1]/text()",
                                  namespaces=self.namespaces)
        schema_location = record.xpath(
            "oai:metadata/comm_para:commParadata/@xsi:schemaLocation",
            namespaces=self.namespaces)

        doc["resource_locator"] = resource_locator[0].strip()

        # comm_para doesn't really include subject or edLevel, so the set
        # name from the header is the only key taken from the record.
        doc["keys"].extend([name.strip() for name in collection])

        doc = self._setLRTestData(doc)
        doc["keys"] = self._unique(doc["keys"])

        doc["payload_schema"].append("comm_para")
        # xsi:schemaLocation is optional; indexing an empty result used to
        # raise IndexError and abort formatting of the record.
        if schema_location:
            doc["payload_schema_locator"] = schema_location[0].strip()

        doc["payload_placement"] = "inline"
        doc["resource_data"] = etree.tostring(payload[0]).strip()

        # Iterate over a snapshot of the keys: deleting from a dict while
        # iterating its live key view raises RuntimeError.
        for key in list(doc.keys()):
            if doc[key] is None:
                del doc[key]

        # signer has a problem with encoding descendents of string type
        # NOTE(review): eval() runs on the repr of data harvested from an
        # external feed. It is kept to preserve the existing conversion to
        # plain built-in types, but should be replaced by an explicit
        # conversion (or ast.literal_eval) once the template's value types
        # are confirmed.
        doc = eval(repr(doc))

        return (repo_id, doc)
self.fetcher.fetchCollections() self.transformer = NSDL(identity=self.identity, config=self.config, namespaces=self.namespaces, col_map=col_names) + elif self.opts.settings["config"]["metadataPrefix"] == "comm_para": + self.transformer = CommPara(identity=self.identity, config=self.config, namespaces=self.namespaces) elif self.opts.settings["config"]["metadataPrefix"] == "oai_dc": self.transformer = OAIDC(identity=self.identity, config=self.config, namespaces=self.namespaces) else: @@ -343,6 +346,7 @@ def publishToNode(self, force=False): log.error("REPOID:{repoid} DOCID:{docid} ERROR: {msg}".format(repoid=repo_id, docid=result["doc_ID"], msg=result["error"])) else: published = True + log.info("Published doc id : %s", result["doc_ID"]) self.couch.saw(repo_id, published) pubcount = numDocs - nonpubcount