From a678f4a2b9bf57ff5f8bf15bdd998972d5a8d38e Mon Sep 17 00:00:00 2001
From: Renzo Frigato <rfrigato@stanford.edu>
Date: Mon, 29 Feb 2016 17:09:42 -0800
Subject: [PATCH] use schemas as they are served from the API

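The json schemas are now served by the API itself at
/api/schemas/<type>/<schema>.json (new SchemaHandler), and the validators
resolve them by URI instead of reading local files:
mongo_from_schema_file/payload_from_schema_file are replaced by
decorator_from_schema_path/from_schema_path, and handlers build the URIs
with util.schema_uri. For example, as in the handler changes below:

    payload_schema_uri = util.schema_uri(self, 'input', 'user.json')
    payload_validator = validators.from_schema_path(payload_schema_uri)
    payload_validator(payload, 'POST')

The expected-schema checks move from api/validators.py to api/config.py,
and jsonschema's RefResolver is subclassed so remote refs can be fetched
without ssl certificate verification while testing.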
---
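Note: for local testing the served schemas can also be fetched directly;
the call below mirrors RefResolver.resolve_remote in api/validators.py,
and the host/port are assumptions for a dev instance:

    import requests
    schema = requests.get('https://localhost:8443/api/schemas/input/user.json',
                          verify=False).json()
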
 api/api.py                       |  10 ++-
 api/config.py                    |  58 +++++++++++++
 api/download.py                  |   3 +-
 api/handlers/containerhandler.py |   7 +-
 api/handlers/grouphandler.py     |  13 ++-
 api/handlers/listhandler.py      |   8 +-
 api/handlers/schemahandler.py    |  21 +++++
 api/handlers/userhandler.py      |  13 ++-
 api/root.py                      |   3 +
 api/upload.py                    |   6 +-
 api/util.py                      |   7 ++
 api/validators.py                | 134 +++++++++++++------------------
 12 files changed, 187 insertions(+), 96 deletions(-)
 create mode 100644 api/handlers/schemahandler.py

diff --git a/api/api.py b/api/api.py
index 11589df1..9f029a00 100644
--- a/api/api.py
+++ b/api/api.py
@@ -17,6 +17,7 @@ from handlers import grouphandler
 from handlers import containerhandler
 from handlers import collectionshandler
 from handlers import searchhandler
+from handlers import schemahandler
 
 log = config.log
 
@@ -56,12 +57,14 @@ routing_regexes = {
     # any character allowed except '/''
     'tag_re': '[^/]{3,24}',
     # filename regex
-    # length between 3 and 60 characters
     # any character allowed except '/'
     'filename_re': '[^/]+',
     # note id regex
     # hexadecimal string exactly of length 24
-    'note_id_re': '[0-9a-f]{24}'
+    'note_id_re': '[0-9a-f]{24}',
+    # schema regex
+    # example: schema_path/schema.json
+    'schema_re': '[^/.]{3,60}/[^/.]{3,60}\.json'
 }
 
 def _format(route):
@@ -134,7 +137,8 @@ routes = [
     webapp2.Route(_format(r'/api/<par_cont_name:groups>/<par_id:{group_id_re}>/<cont_name:projects>'),          containerhandler.ContainerHandler, name='cont_sublist_groups', handler_method='get_all', methods=['GET']),
     webapp2.Route(_format(r'/api/<par_cont_name:{cont_name_re}>/<par_id:{cid_re}>/<cont_name:{cont_name_re}>'), containerhandler.ContainerHandler, name='cont_sublist', handler_method='get_all', methods=['GET']),
     webapp2.Route(_format(r'/api/search'),                                            searchhandler.SearchHandler, name='es_proxy', methods=['GET']),
-    webapp2.Route(_format(r'/api/search/<cont_name:{cont_name_re}>'),                 searchhandler.SearchHandler, name='es_proxy', methods=['GET']),
+    webapp2.Route(_format(r'/api/search/<cont_name:{cont_name_re}>'),                 searchhandler.SearchHandler, name='es_proxy_1', methods=['GET']),
+    webapp2.Route(_format(r'/api/schemas/<schema:{schema_re}>'),                      schemahandler.SchemaHandler, name='schemas', methods=['GET']),
 ]
 
 
diff --git a/api/config.py b/api/config.py
index 6ca252b2..ad3c92e3 100644
--- a/api/config.py
+++ b/api/config.py
@@ -1,5 +1,6 @@
 import os
 import copy
+import glob
 import logging
 import pymongo
 import datetime
@@ -46,6 +47,7 @@ DEFAULT_CONFIG = {
         'db_connect_timeout': '2000',
         'db_server_selection_timeout': '3000',
         'data_path': os.path.join(os.path.dirname(__file__), '../persistent/data'),
+        'schema_path': 'api/schemas',
         'elasticsearch_host': 'localhost:9200',
     }
 }
@@ -87,6 +89,62 @@ log.debug(str(db))
 
 es = elasticsearch.Elasticsearch([__config['persistent']['elasticsearch_host']])
 
+# validate the lists of json schemas
+schema_path = __config['persistent']['schema_path']
+
+expected_mongo_schemas = set([
+    'acquisition.json',
+    'collection.json',
+    'container.json',
+    'file.json',
+    'group.json',
+    'note.json',
+    'permission.json',
+    'project.json',
+    'session.json',
+    'subject.json',
+    'user.json',
+    'avatars.json',
+    'tag.json'
+])
+expected_input_schemas = set([
+    'acquisition.json',
+    'collection.json',
+    'container.json',
+    'file.json',
+    'group.json',
+    'note.json',
+    'packfile.json',
+    'permission.json',
+    'project.json',
+    'session.json',
+    'subject.json',
+    'user.json',
+    'avatars.json',
+    'download.json',
+    'tag.json',
+    'enginemetadata.json',
+    'uploader.json',
+    'reaper.json'
+])
+mongo_schemas = set()
+input_schemas = set()
+# check that the expected schema files are present and readable at start time
+for schema_filepath in glob.glob(schema_path + '/mongo/*.json'):
+    schema_file = os.path.basename(schema_filepath)
+    mongo_schemas.add(schema_file)
+    with open(schema_filepath, 'rU') as f:
+        pass
+
+assert mongo_schemas == expected_mongo_schemas, '{} is different from {}'.format(mongo_schemas, expected_mongo_schemas)
+
+for schema_filepath in glob.glob(schema_path + '/input/*.json'):
+    schema_file = os.path.basename(schema_filepath)
+    input_schemas.add(schema_file)
+    with open(schema_filepath, 'rU') as f:
+        pass
+
+assert input_schemas == expected_input_schemas, '{} is different from {}'.format(input_schemas, expected_input_schemas)
 
 def initialize_db():
     log.info('Initializing database, creating indexes')
diff --git a/api/download.py b/api/download.py
index 6dcb7750..6955c742 100644
--- a/api/download.py
+++ b/api/download.py
@@ -198,7 +198,8 @@ class Download(base.RequestHandler):
                 config.db.projects.update_one({'_id': project_id}, {'$inc': {'counter': 1}})
         else:
             req_spec = self.request.json_body
-            validator = validators.payload_from_schema_file(self, 'download.json')
+            payload_schema_uri = util.schema_uri(self, 'input', 'download.json')
+            validator = validators.from_schema_path(payload_schema_uri)
             validator(req_spec, 'POST')
             log.debug(json.dumps(req_spec, sort_keys=True, indent=4, separators=(',', ': ')))
             return self._preflight_archivestream(req_spec)
diff --git a/api/handlers/containerhandler.py b/api/handlers/containerhandler.py
index 5112daee..5c789f78 100644
--- a/api/handlers/containerhandler.py
+++ b/api/handlers/containerhandler.py
@@ -306,8 +306,10 @@ class ContainerHandler(base.RequestHandler):
 
 
     def _get_validators(self):
-        mongo_validator = validators.mongo_from_schema_file(self.config.get('storage_schema_file'))
-        payload_validator = validators.payload_from_schema_file(self.config.get('payload_schema_file'))
+        mongo_schema_uri = util.schema_uri(self, 'mongo', self.config.get('storage_schema_file'))
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        payload_schema_uri = util.schema_uri(self, 'input', self.config.get('payload_schema_file'))
+        payload_validator = validators.from_schema_path(payload_schema_uri)
         return mongo_validator, payload_validator
 
     def _get_parent_container(self, payload):
@@ -329,7 +331,6 @@ class ContainerHandler(base.RequestHandler):
         log.debug(parent_container)
         return parent_container, parent_id_property
 
-
     def _get_container(self, _id):
         try:
             container = self.storage.get_container(_id)
diff --git a/api/handlers/grouphandler.py b/api/handlers/grouphandler.py
index 62344c1c..8e709e72 100644
--- a/api/handlers/grouphandler.py
+++ b/api/handlers/grouphandler.py
@@ -1,6 +1,7 @@
 import datetime
 
 from .. import base
+from .. import util
 from .. import config
 from .. import debuginfo
 from .. import validators
@@ -58,8 +59,10 @@ class GroupHandler(base.RequestHandler):
             self.abort(404, 'no such Group: ' + _id)
         permchecker = groupauth.default(self, group)
         payload = self.request.json_body
-        mongo_validator = validators.mongo_from_schema_file('group.json')
-        payload_validator = validators.payload_from_schema_file('group.json')
+        mongo_schema_uri = util.schema_uri(self, 'mongo', 'group.json')
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        payload_schema_uri = util.schema_uri(self, 'input', 'group.json')
+        payload_validator = validators.from_schema_path(payload_schema_uri)
         payload_validator(payload, 'PUT')
         result = mongo_validator(permchecker(self.storage.exec_op))('PUT', _id=_id, payload=payload)
         if result.modified_count == 1:
@@ -71,8 +74,10 @@ class GroupHandler(base.RequestHandler):
         self._init_storage()
         permchecker = groupauth.default(self, None)
         payload = self.request.json_body
-        mongo_validator = validators.mongo_from_schema_file('group.json')
-        payload_validator = validators.payload_from_schema_file('group.json')
+        mongo_schema_uri = util.schema_uri(self, 'mongo', 'group.json')
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        payload_schema_uri = util.schema_uri(self, 'input', 'group.json')
+        payload_validator = validators.from_schema_path(payload_schema_uri)
         payload_validator(payload, 'POST')
         payload['created'] = payload['modified'] = datetime.datetime.utcnow()
         payload['roles'] = [{'_id': self.uid, 'access': 'admin', 'site': self.user_site}] if self.uid else []
diff --git a/api/handlers/listhandler.py b/api/handlers/listhandler.py
index 9ac1181c..b9086002 100644
--- a/api/handlers/listhandler.py
+++ b/api/handlers/listhandler.py
@@ -183,9 +183,11 @@ class ListHandler(base.RequestHandler):
                 permchecker = permchecker(self, container)
         else:
             self.abort(404, 'Element {} not found in container {}'.format(_id, storage.cont_name))
-        mongo_validator = validators.mongo_from_schema_file(config.get('storage_schema_file'))
-        input_validator = validators.payload_from_schema_file(config.get('input_schema_file'))
-        keycheck = validators.key_check(config.get('storage_schema_file'))
+        mongo_schema_uri = util.schema_uri(self, 'mongo', config.get('storage_schema_file'))
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        input_schema_uri = util.schema_uri(self, 'input', config.get('input_schema_file'))
+        input_validator = validators.from_schema_path(input_schema_uri)
+        keycheck = validators.key_check(mongo_schema_uri)
         return container, permchecker, storage, mongo_validator, input_validator, keycheck
 
 
diff --git a/api/handlers/schemahandler.py b/api/handlers/schemahandler.py
new file mode 100644
index 00000000..18eb4a00
--- /dev/null
+++ b/api/handlers/schemahandler.py
@@ -0,0 +1,21 @@
+import os
+import json
+import datetime
+
+from .. import base
+from .. import config
+
+log = config.log
+
+class SchemaHandler(base.RequestHandler):
+
+    def __init__(self, request=None, response=None):
+        super(SchemaHandler, self).__init__(request, response)
+
+    def get(self, schema, **kwargs):
+        schema_path = os.path.join(config.get_item('persistent', 'schema_path'), schema)
+        try:
+            with open(schema_path, 'rU') as f:
+                return json.load(f)
+        except IOError as e:
+            self.abort(404, str(e))
diff --git a/api/handlers/userhandler.py b/api/handlers/userhandler.py
index 42b6e65e..ca36fe0d 100644
--- a/api/handlers/userhandler.py
+++ b/api/handlers/userhandler.py
@@ -3,6 +3,7 @@ import datetime
 import requests
 
 from .. import base
+from .. import util
 from .. import config
 from .. import validators
 from ..auth import userauth, always_ok, ROLES
@@ -64,8 +65,10 @@ class UserHandler(base.RequestHandler):
         user = self._get_user(_id)
         permchecker = userauth.default(self, user)
         payload = self.request.json_body
-        mongo_validator = validators.mongo_from_schema_file('user.json')
-        payload_validator = validators.payload_from_schema_file('user.json')
+        mongo_schema_uri = util.schema_uri(self, 'mongo', 'user.json')
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        payload_schema_uri = util.schema_uri(self, 'input', 'user.json')
+        payload_validator = validators.from_schema_path(payload_schema_uri)
         payload_validator(payload, 'PUT')
         payload['modified'] = datetime.datetime.utcnow()
         result = mongo_validator(permchecker(self.storage.exec_op))('PUT', _id=_id, payload=payload)
@@ -78,8 +81,10 @@ class UserHandler(base.RequestHandler):
         self._init_storage()
         permchecker = userauth.default(self)
         payload = self.request.json_body
-        mongo_validator = validators.mongo_from_schema_file('user.json')
-        payload_validator = validators.payload_from_schema_file('user.json')
+        mongo_schema_uri = util.schema_uri(self, 'mongo', 'user.json')
+        mongo_validator = validators.decorator_from_schema_path(mongo_schema_uri)
+        payload_schema_uri = util.schema_uri(self, 'input', 'user.json')
+        payload_validator = validators.from_schema_path(payload_schema_uri)
         payload_validator(payload, 'POST')
         payload['created'] = payload['modified'] = datetime.datetime.utcnow()
         payload['root'] = payload.get('root', False)
diff --git a/api/root.py b/api/root.py
index c98063e5..cbcc9b66 100644
--- a/api/root.py
+++ b/api/root.py
@@ -2,6 +2,9 @@ import re
 import markdown
 
 from . import base
+from . import config
+
+log = config.log
 
 class Root(base.RequestHandler):
 
diff --git a/api/upload.py b/api/upload.py
index f52cfc04..74e584b3 100644
--- a/api/upload.py
+++ b/api/upload.py
@@ -170,7 +170,8 @@ class Upload(base.RequestHandler):
                 self.abort(400, str(e))
             if not file_store.metadata:
                 self.abort(400, 'metadata is missing')
-            metadata_validator = validators.payload_from_schema_file('uploader.json')
+            payload_schema_uri = util.schema_uri(self, 'input', 'uploader.json')
+            metadata_validator = validators.from_schema_path(payload_schema_uri)
             metadata_validator(file_store.metadata, 'POST')
             try:
                 target_containers = reaperutil.create_root_to_leaf_hierarchy(file_store.metadata, file_store.files)
@@ -216,7 +217,8 @@ class Upload(base.RequestHandler):
                 self.abort(400, str(e))
             if not file_store.metadata:
                 self.abort(400, 'metadata is missing')
-            metadata_validator = validators.payload_from_schema_file('enginemetadata.json')
+            payload_schema_uri = util.schema_uri(self, 'input', 'enginemetadata.json')
+            metadata_validator = validators.from_schema_path(payload_schema_uri)
             metadata_validator(file_store.metadata, 'POST')
             file_infos = file_store.metadata['acquisition'].pop('files', [])
             now = datetime.datetime.utcnow()
diff --git a/api/util.py b/api/util.py
index f553ba1f..1d3f927a 100644
--- a/api/util.py
+++ b/api/util.py
@@ -136,6 +136,13 @@ def send_json_http_exception(response, message, code):
     response.headers['Content-Type'] = 'application/json; charset=utf-8'
     response.write(content)
 
+def schema_uri(handler, type_, schema_name):
+    return handler.uri_for(
+        'schemas',
+        schema=type_ + '/' + schema_name,
+        _full=True
+    )
+
 class Enum(baseEnum.Enum):
     # Enum strings are prefixed by their class: "Category.classifier".
     # This overrides that behaviour and removes the prefix.
diff --git a/api/validators.py b/api/validators.py
index d73f56b3..aaf223c4 100644
--- a/api/validators.py
+++ b/api/validators.py
@@ -1,7 +1,10 @@
 import os
+import re
 import copy
-import glob
+import json
+import requests
 import jsonschema
+from jsonschema.compat import urlopen, urlsplit
 
 from . import config
 
@@ -13,91 +16,69 @@ class InputValidationException(Exception):
 class DBValidationException(Exception):
     pass
 
-# following https://github.com/Julian/jsonschema/issues/98
-# json schema files are expected to be in the schemas folder relative to this module
-schema_path = os.path.abspath(os.path.dirname(__file__))
-
-resolver_input = jsonschema.RefResolver('file://' + schema_path + '/schemas/input/', None)
-resolver_mongo = jsonschema.RefResolver('file://' + schema_path + '/schemas/mongo/', None)
-
-expected_mongo_schemas = set([
-    'acquisition.json',
-    'collection.json',
-    'container.json',
-    'file.json',
-    'group.json',
-    'note.json',
-    'permission.json',
-    'project.json',
-    'session.json',
-    'subject.json',
-    'user.json',
-    'avatars.json',
-    'tag.json'
-])
-expected_input_schemas = set([
-    'acquisition.json',
-    'collection.json',
-    'container.json',
-    'file.json',
-    'group.json',
-    'note.json',
-    'permission.json',
-    'project.json',
-    'session.json',
-    'subject.json',
-    'user.json',
-    'avatars.json',
-    'download.json',
-    'tag.json',
-    'enginemetadata.json',
-    'packfile.json',
-    'uploader.json',
-    'reaper.json'
-])
-mongo_schemas = set()
-input_schemas = set()
-# validate and cache schemas at start time
-for schema_filepath in glob.glob(schema_path + '/schemas/mongo/*.json'):
-    schema_file = os.path.basename(schema_filepath)
-    mongo_schemas.add(schema_file)
-    resolver_mongo.resolve(schema_file)
-
-assert mongo_schemas == expected_mongo_schemas, '{} is different from {}'.format(mongo_schemas, expected_mongo_schemas)
-
-for schema_filepath in glob.glob(schema_path + '/schemas/input/*.json'):
-    schema_file = os.path.basename(schema_filepath)
-    input_schemas.add(schema_file)
-    resolver_input.resolve(schema_file)
-
-assert input_schemas == expected_input_schemas, '{} is different from {}'.format(input_schemas, expected_input_schemas)
-
-def validate_data(data, schema_name, verb, optional=False):
+def validate_data(data, schema_url, verb, optional=False):
     """
     Convenience method to validate a JSON schema against some action.
 
     If optional is set, validate_data won't complain about null data.
     """
+    raise NotImplementedError('this needs to be adapted, using the new schema endpoint')
 
     if optional and data is None:
         return
 
-    validator = payload_from_schema_file(schema_name)
+    validator = from_schema_path(schema_url)
     validator(data, verb)
 
 def _validate_json(json_data, schema, resolver):
     jsonschema.validate(json_data, schema, resolver=resolver)
-    #jsonschema.Draft4Validator(schema, resolver=resolver).validate(json_data)
+
+class RefResolver(jsonschema.RefResolver):
+
+    def resolve_remote(self, uri):
+        """override default resolve_remote
+        to allow testing when there is no ssl certificate
+        """
+        scheme = urlsplit(uri).scheme
+
+        if scheme in self.handlers:
+            result = self.handlers[scheme](uri)
+        elif (
+            scheme in [u"http", u"https"] and
+            requests and
+            getattr(requests.Response, "json", None) is not None
+        ):
+            # Requests has support for detecting the correct encoding of
+            # json over http
+            if callable(requests.Response.json):
+                result = requests.get(uri, verify=False).json()
+            else:
+                result = requests.get(uri, verify=False).json
+        else:
+            # Otherwise, pass off to urllib and assume utf-8
+            result = json.loads(urlopen(uri).read().decode("utf-8"))
+
+        if self.cache_remote:
+            self.store[uri] = result
+        return result
+
+# We cache one resolver per base_uri, so that schemas already fetched by that resolver are reused.
+resolvers = {}
+def _resolve_schema(schema_url):
+    base_uri, schema_name = re.match('(.*/)(.*)', schema_url).groups()
+    if not resolvers.get(base_uri):
+        resolvers[base_uri] = RefResolver(base_uri, None)
+    return resolvers[base_uri].resolve(schema_name)[1], resolvers[base_uri]
 
 def no_op(g, *args):
     return g
 
-def mongo_from_schema_file(schema_file):
-    if schema_file is None:
+def decorator_from_schema_path(schema_url):
+    if schema_url is None:
         return no_op
-    schema = resolver_mongo.resolve(schema_file)[1]
+    schema, resolver = _resolve_schema(schema_url)
     def g(exec_op):
-        def mongo_val(method, **kwargs):
+        def validator(method, **kwargs):
             payload = kwargs['payload']
             log.debug(payload)
             if method == 'PUT' and schema.get('required'):
@@ -107,17 +88,18 @@ def mongo_from_schema_file(schema_file):
                 _schema = schema
             if method in ['POST', 'PUT']:
                 try:
-                    _validate_json(payload, _schema, resolver_mongo)
+                    _validate_json(payload, _schema, resolver)
                 except jsonschema.ValidationError as e:
                     raise DBValidationException(str(e))
             return exec_op(method, **kwargs)
-        return mongo_val
+        return validator
     return g
 
-def payload_from_schema_file(schema_file):
-    if schema_file is None:
+def from_schema_path(schema_url):
+    if schema_url is None:
         return no_op
-    schema = resolver_input.resolve(schema_file)[1]
+    # _resolve_schema splits the url into base_uri and schema_name
+    schema, resolver = _resolve_schema(schema_url)
     def g(payload, method):
         if method == 'PUT' and schema.get('required'):
             _schema = copy.copy(schema)
@@ -126,12 +108,12 @@ def payload_from_schema_file(schema_file):
             _schema = schema
         if method in ['POST', 'PUT']:
             try:
-                _validate_json(payload, _schema, resolver_input)
+                _validate_json(payload, _schema, resolver)
             except jsonschema.ValidationError as e:
                 raise InputValidationException(str(e))
     return g
 
-def key_check(schema_file):
+def key_check(schema_url):
     """
     for sublists of mongo container there is no automatic key check when creating, updating or deleting an object.
     We are adding a custom array field to the json schemas ("key_fields").
@@ -146,9 +128,9 @@ def key_check(schema_file):
     2. a GET will retrieve a single item
     3. a DELETE (most importantly) will delete a single item
     """
-    if schema_file is None:
+    if schema_url is None:
         return no_op
-    schema = resolver_mongo.resolve(schema_file)[1]
+    schema, _ = _resolve_schema(schema_url)
     log.debug(schema)
     if schema.get('key_fields') is None:
         return no_op
-- 
GitLab