# containers.py 15.26 KiB
# @author: Gunnar Schaefer, Kevin S. Hahn
import logging
log = logging.getLogger('scitran.api')
import os
import json
import hashlib
import datetime
import jsonschema
import bson.json_util
import tempdir as tempfile
import base
import util
import users
import tempdir as tempfile
# JSON-schema (draft-04) entries describing a single file record: an object
# with the string fields 'name', 'ext', 'sha1' and 'type', the integer field
# 'size', and the array fields 'kinds' and 'state'. All seven are required and
# no other properties are allowed.
# NOTE(review): the line that opens this dict and the closing braces of the
# nested dicts are not present in this view. The upload schema refers to
# FILE_SCHEMA, which is presumably this dict -- confirm against the full file.
'$schema': 'http://json-schema.org/draft-04/schema#',
'title': 'File',
'type': 'object',
'properties': {
'name': {
'title': 'Name',
'type': 'string',
'ext': {
'title': 'Extension',
'type': 'string',
'size': {
'title': 'Size',
'type': 'integer',
'sha1': {
'title': 'SHA-1',
'type': 'string',
'type': {
'title': 'Type',
'type': 'string',
'kinds': {
'title': 'Kinds',
'type': 'array',
'state': {
'title': 'State',
'type': 'array',
'required': ['name', 'ext', 'size', 'sha1', 'type', 'kinds', 'state'],
'additionalProperties': False
# JSON-schema (draft-04) entries describing upload metadata: an object whose
# only (and required) property is 'files', an array of unique FILE_SCHEMA items.
# NOTE(review): the line that opens this dict and its closing braces are not
# present in this view. put_file() validates against FILE_UPLOAD_SCHEMA, which
# is presumably this dict -- confirm against the full file.
'$schema': 'http://json-schema.org/draft-04/schema#',
'title': 'Upload',
'type': 'object',
'properties': {
'files': {
'title': 'Files',
'type': 'array',
'items': FILE_SCHEMA,
'uniqueItems': True,
'required': ['files'],
'additionalProperties': False,
# JSON-schema (draft-04) entries describing a file-download request. Under
# 'anyOf' there are two object alternatives: one requiring 'type', 'kinds' and
# 'state', the other requiring 'name' and 'ext'.
# NOTE(review): the line that opens this dict, the braces that separate the two
# 'anyOf' alternatives, and all closing braces are not present in this view.
# get_file() validates against FILE_DOWNLOAD_SCHEMA, which is presumably this
# dict -- confirm against the full file.
'$schema': 'http://json-schema.org/draft-04/schema#',
'title': 'File Download',
'anyOf': [
# Alternative 1: select a file by its type / kinds / state.
'type': 'object',
'properties': {
'type': {
'title': 'Type',
'type': 'string',
'kinds': {
'title': 'Kinds',
'type': 'array',
'state': {
'title': 'State',
'type': 'array',
'required': ['type', 'kinds', 'state'],
# Alternative 2: select a file by its name and extension.
# NOTE(review): both 'name' and 'ext' carry the title 'Type' below; this looks
# like a copy-paste leftover ('Name' / 'Extension' are used in the file schema).
'type': 'object',
'properties': {
'name': {
'title': 'Type',
'type': 'string',
'ext': {
'title': 'Type',
'type': 'string',
'required': ['name', 'ext'],
class ContainerList(base.RequestHandler):

    """Request handler base for listing the containers a requester may see."""

    def _get(self, query, projection, admin_only=False):
        """
        Return the containers matching `query`, limited to what the requester may see.

        `query` and `projection` are modified in place:

        - public requests only match containers flagged 'public';
        - all other requests project only the requester's own permission entry;
        - non-superuser requests additionally only match containers on which the
          requester has a permission entry (with 'admin' access if `admin_only`).

        Returns a list of container documents with '_id' converted to a string.
        """
        if self.public_request:
            query['public'] = True
        else:
            # Only ever expose the requester's own entry of the permissions list.
            projection['permissions'] = {'$elemMatch': {'_id': self.uid, 'site': self.source_site}}
            if not self.superuser_request:
                # The two filters are alternatives: the admin-only filter must not
                # be overwritten by the weaker "any access" filter.
                if admin_only:
                    query['permissions'] = {'$elemMatch': {'_id': self.uid, 'site': self.source_site, 'access': 'admin'}}
                else:
                    query['permissions'] = {'$elemMatch': {'_id': self.uid, 'site': self.source_site}}
        containers = list(self.dbc.find(query, projection))
        for container in containers:
            container['_id'] = str(container['_id'])
        return containers
class Container(base.RequestHandler):
def _get(self, _id, min_role=None, perm_only=False, dbc=None, dbc_name=None):
    """
    Fetch one container and enforce the requester's access to it.

    `_id` is the container id used for the lookup.  `min_role`, if given, is the
    lowest role (a key of users.INTEGER_ROLES) the requester must hold.  With
    `perm_only` only the 'permissions' field is fetched.  `dbc` and `dbc_name`
    override the collection and the name used in error messages; they default
    to self.dbc and the handler's class name.

    Aborts with 404 if the container does not exist and with 403 if the
    requester lacks access.  Returns a `(container, user_perm)` tuple, with
    container['_id'] converted to a string.
    """
    dbc = dbc or self.dbc
    dbc_name = dbc_name or self.__class__.__name__
    container = dbc.find_one({'_id': _id}, ['permissions'] if perm_only else None)
    if not container:
        self.abort(404, 'no such ' + dbc_name)
    user_perm = util.user_perm(container['permissions'], self.uid, self.source_site)
    if self.public_request:
        if not container.get('public', False):
            self.abort(403, 'this ' + dbc_name + ' is not public')
        del container['permissions']
    elif not self.superuser_request:
        if not user_perm:
            self.abort(403, self.uid + ' does not have permissions on this ' + dbc_name)
        if min_role and users.INTEGER_ROLES[user_perm['access']] < users.INTEGER_ROLES[min_role]:
            self.abort(403, self.uid + ' does not have at least ' + min_role + ' permissions on this ' + dbc_name)
        if user_perm['access'] != 'admin':  # if not admin, mask permissions of other users
            container['permissions'] = [user_perm]
    if self.request.get('paths').lower() in ('1', 'true'):
        oid = str(_id)
        # 'files' is absent when only 'permissions' was projected (perm_only).
        for file_info in container.get('files', []):
            file_info['path'] = oid[-3:] + '/' + oid + '/' + file_info['name'] + file_info['ext']
    container['_id'] = str(container['_id'])
    return container, user_perm
def put(self, _id):
    """
    Update container `_id` from the validated JSON request body.

    Changing 'permissions' requires the 'admin' role, any other change
    requires 'rw'.  Returns the validated body that was written.
    """
    payload = self.validate_json_body(_id, ['project'])
    if 'permissions' in payload:
        required_role = 'admin'
    else:
        required_role = 'rw'
    # Called for its access check only; the returned container is not needed.
    self._get(_id, required_role, perm_only=True)
    self.update_db(_id, payload)
    return payload
def validate_json_body(self, _id, oid_keys=[]):
# Validate the JSON request body against self.put_schema, convert the values
# of those `oid_keys` that are present into bson.ObjectId, and return the body.
# Aborts with 400 when ValueError or jsonschema.ValidationError is raised.
# `_id` is not used in the code visible here.
# NOTE(review): this view has lost its indentation and some lines; the `try:`
# that pairs with the `except` below is not shown.
# NOTE(review): `oid_keys=[]` is a mutable default argument; it is only read here.
json_body = self.request.json_body
jsonschema.validate(json_body, self.put_schema)
except (ValueError, jsonschema.ValidationError) as e:
self.abort(400, str(e))
# NOTE(review): the statement guarded by the following `if` is not visible in
# this view -- confirm how a null 'permissions' value is meant to be handled.
if 'permissions' in json_body and json_body['permissions'] is None:
# Convert id-valued keys from their JSON string form to ObjectId.
for key in oid_keys:
if key in json_body:
json_body[key] = bson.ObjectId(json_body[key])
return json_body
def update_db(self, _id, json_body):
    """Apply `json_body` to the container `_id` as a '$set' update."""
    selector = {'_id': _id}
    changes = {'$set': util.mongo_dict(json_body)}
    self.dbc.update(selector, changes)
def get_file(self, cid):
    """
    Download one file of container `cid`, or issue a download ticket for it.

    The JSON request body must satisfy FILE_DOWNLOAD_SCHEMA: it selects a file
    either by 'name' and 'ext' or, when no 'name' is given, by 'type', 'kinds'
    and 'state'.  The first matching file wins.

    For a GET request the file is streamed in the response.  For any other
    method a download ticket is stored and {'url': <ticket url>} is returned.

    Aborts with 400 on an invalid body and with 404 when no file matches.
    """
    try:
        file_spec = self.request.json_body
        jsonschema.validate(file_spec, FILE_DOWNLOAD_SCHEMA)
    except (ValueError, jsonschema.ValidationError) as e:
        self.abort(400, str(e))
    _id = bson.ObjectId(cid)
    container, _ = self._get(_id, 'ro')
    select_by_name = 'name' in file_spec
    for file_info in container.get('files', []):
        if select_by_name:
            if file_info['name'] == file_spec['name'] and file_info['ext'] == file_spec['ext']:
                break
        elif file_info['type'] == file_spec['type'] and file_info['kinds'] == file_spec['kinds'] and file_info['state'] == file_spec['state']:
            break
    else:
        # Only reached when the loop finished without a match; relies on
        # self.abort() raising, as the rest of this handler does.
        self.abort(404, 'no such file')
    filename = file_info['name'] + file_info['ext']
    filepath = os.path.join(self.app.config['data_path'], str(_id)[-3:] + '/' + str(_id), filename)
    if self.request.method == 'GET':
        self.response.app_iter = open(filepath, 'rb')
        self.response.headers['Content-Length'] = str(file_info['size'])  # must be set after setting app_iter
        self.response.headers['Content-Type'] = 'application/octet-stream'
    else:
        ticket = util.download_ticket('single', filepath, filename, file_info['size'])
        tkt_id = self.app.db.downloads.insert(ticket)
        return {'url': self.uri_for('download', _full=True, ticket=tkt_id)}
def put_file(self, cid=None):
# NOTE(review): this view has lost its indentation and some lines. The prose
# below is the method's docstring, but its triple-quote delimiters are not shown.
Receive a targeted processor or user upload.
Accepts a multipart request that contains json in first part, and data in second part.
This POST route is used to add a file to an existing container, not for creating new containers.
This upload is different from the main PUT route, because this does not update the primary
metadata, nor does it try to determine where to place the file. It always gets placed in
the current container.
# TODO; revise how engine's upload their data to be compatible with the put_attachment fxn
# Nested helper: writes `stream` to `filename` under `tempdir_path` (a name
# taken from the enclosing scope), reading 1 MiB at a time, and aborts with 400
# when the SHA-1 hex digest differs from `digest`. Returns the written path.
# NOTE(review): the body of the chunk loop (presumably the hash update and the
# file write) is not visible in this view.
# NOTE(review): the '' sentinel for iter() only ends the loop if read()
# returns str -- confirm for the stream types used.
def receive_stream_and_validate(stream, digest, filename):
# FIXME pull this out to also be used from core.Core.put() and also replace the duplicated code below
hash_ = hashlib.sha1()
filepath = os.path.join(tempdir_path, filename)
with open(filepath, 'wb') as fd:
for chunk in iter(lambda: stream.read(2**20), ''):
# NOTE(review): the message says 'Content-MD5' but the digest compared is SHA-1.
if hash_.hexdigest() != digest:
self.abort(400, 'Content-MD5 mismatch.')
return filepath
# NOTE(review): the bodies of the two branches below are not visible in this view.
if cid is None: # sortable user upload
else: # targeted upload
if self.request.content_type != 'multipart/form-data': # do not accept the OTHER sort of multipart
self.abort(400, 'content-type must be "multipart/form-data"')
# Parse the 'metadata' request field as JSON and validate it against
# FILE_UPLOAD_SCHEMA; parse/validation errors abort with 400.
# NOTE(review): the `try:` that pairs with the `except` below is not shown.
metadata = json.loads(self.request.get('metadata'))
jsonschema.validate(metadata, FILE_UPLOAD_SCHEMA)
except (ValueError, jsonschema.ValidationError) as e:
self.abort(400, str(e))
# Choose the target container id: None for public (processor) requests,
# otherwise the container given by `cid`, which requires 'rw' access.
if self.public_request: # processor upload
_id = None
# FIXME check that processor is legit
elif cid is not None: # targeted user upload
_id = bson.ObjectId(cid)
container, _ = self._get(_id, 'rw')
# NOTE(review): the body of this `else` is not visible in this view.
else: # sortable user upload
# FIXME: pre-parse file, reject if unparsable
data_path = self.app.config['data_path']
quarantine_path = self.app.config['quarantine_path']
# Files are received into a temporary directory created inside data_path.
with tempfile.TemporaryDirectory(prefix='.tmp', dir=data_path) as tempdir_path:
for file_info in metadata['files']:
hash_ = hashlib.sha1()
filename = file_info['name'] + file_info['ext']
filepath = os.path.join(tempdir_path, filename)
# The uploaded data is looked up in the POST fields under the file's full name.
field_storage_obj = self.request.POST.get(filename)
# NOTE(review): `filepaths` is not defined anywhere in the visible code;
# `filepath` (assigned above) looks like the intended name -- confirm.
# NOTE(review): the body of the chunk loop is not visible in this view.
with open(filepaths[-1], 'wb') as fd:
for chunk in iter(lambda: field_storage_obj.file.read(2**20), ''):
# NOTE(review): the message says 'Content-MD5' but the digest compared is SHA-1.
if hash_.hexdigest() != file_info['sha1']:
self.abort(400, 'Content-MD5 mismatch.')
log.info('Received %s [%s] from %s' % (filename, util.hrsize(file_info['size']), self.request.user_agent)) # FIXME: user_agent or uid
# util.insert_file reports a (status, detail) pair; anything but 200 aborts the request.
status, detail = util.insert_file(self.dbc, _id, file_info, filepath, file_info['sha1'], data_path, quarantine_path)
if status != 200:
self.abort(status, detail)
def put_attachment(self, cid):
# NOTE(review): this view has lost its indentation and some lines. The prose
# below is the method's docstring, but its triple-quote delimiters are not shown.
Recieve a targetted user upload of an attachment.
Attachments are different from files, in that they are not 'research ready'. Attachments
represent other documents that are generally not useable by the engine; documents like
consent forms, pen/paper questionnaires, study recruiting materials, etc.
Internally, attachments are distinguished from files because of what metadata is
required. Attachments really only need a 'kinds' and 'type'. We don't expect iteration over
an attachment in a way that would require tracking 'state'.
# TODO read self.request.body, using '------WebKitFormBoundary' as divider
# first line is 'content-disposition' line, extract filename
# second line is content-type, determine how to write to a file, as bytes or as string
# third line is just a separator, useless
if self.request.content_type != 'multipart/form-data':
self.abort(400, 'content-type must be "multipart/form-data"')
# TODO: metadata validation
# The requester needs 'rw' access on the target container.
_id = bson.ObjectId(cid)
container, _ = self._get(_id, 'rw')
data_path = self.app.config['data_path']
quarantine_path = self.app.config['quarantine_path']
# NOTE(review): `hashes` is appended to below but not otherwise used in the visible code.
hashes = []
# Files are received into a temporary directory created inside data_path.
with tempfile.TemporaryDirectory(prefix='.tmp', dir=self.app.config['data_path']) as tempdir_path:
# get and hash the metadata
# NOTE(review): no update of `metahash` with `metastr` is visible here; if none
# exists, the digest recorded below is that of empty input -- confirm.
metahash = hashlib.sha1()
metastr = self.request.POST.get('metadata').file.read() # returns a string?
metadata = json.loads(metastr)
hashes.append({'name': 'metadata', 'sha1': metahash.hexdigest()})
# The 'sha' field is parsed as JSON; each entry is read via .get('name') and .get('sha1').
sha1s = json.loads(self.request.POST.get('sha').file.read())
for finfo in metadata:
fname = finfo.get('name') + finfo.get('ext') # finfo['ext'] will always be empty
fhash = hashlib.sha1()
fobj = self.request.POST.get(fname).file
filepath = os.path.join(tempdir_path, fname)
# NOTE(review): the body of the chunk loop (presumably the hash update and the
# file write) is not visible in this view.
with open(filepath, 'wb') as fd:
for chunk in iter(lambda: fobj.read(2**20), ''):
# Find the expected SHA-1 for this file and compare it with the computed digest.
# NOTE(review): the lines separating the "found" path from the final "not listed"
# abort (presumably a `break` and a loop `else:`) are not visible in this view.
for s in sha1s:
if fname == s.get('name'):
# NOTE(review): the message says 'Content-MD5' but the digests compared are SHA-1.
if fhash.hexdigest() != s.get('sha1'):
self.abort(400, 'Content-MD5 mismatch %s vs %s' % (fhash.hexdigest(), s.get('sha1')))
finfo['sha1'] = s.get('sha1')
status, detail = util.insert_file(self.dbc, _id, finfo, filepath, s.get('sha1'), data_path, quarantine_path, flavor='attachment')
# NOTE(review): any non-200 status is reported as a plain 400; `status` and
# `detail` are not forwarded, unlike in put_file.
if status != 200:
self.abort(400, 'upload failed')
self.abort(400, '%s is not listed in the sha1s' % fname)
def get_attachment(self, cid):
    """
    Download one attachment of container `cid`, or issue a download ticket for it.

    The attachment is selected by the 'name' request parameter, which must
    equal the stored name plus extension.  For a GET request the file is
    streamed in the response.  For any other method a download ticket is
    stored and {'url': <ticket url>} is returned.

    Aborts with 404 when the container has no attachment with that name.
    """
    fname = self.request.get('name')
    _id = bson.ObjectId(cid)
    container, _ = self._get(_id, 'ro')
    fpath = os.path.join(self.app.config['data_path'], str(_id)[-3:] + '/' + str(_id), fname)
    # A container without an 'attachments' field is treated as having none.
    for a_info in container.get('attachments', []):
        if (a_info['name'] + a_info['ext']) == fname:
            break
    else:
        # Only reached when no attachment matched; relies on self.abort() raising.
        self.abort(404, 'no such file')
    if self.request.method == 'GET':
        self.response.app_iter = open(fpath, 'rb')
        self.response.headers['Content-Length'] = str(a_info['size'])  # must be set after setting app_iter
        self.response.headers['Content-Type'] = 'application/octet-stream'
    else:
        ticket = util.download_ticket('single', fpath, fname, a_info['size'])
        tkt_id = self.app.db.downloads.insert(ticket)
        return {'url': self.uri_for('download', _full=True, ticket=tkt_id)}
def delete_attachment(self, cid):
    """
    Delete one attachment of container `cid`.

    The attachment is selected by the 'name' request parameter, which must
    equal the stored name plus extension.  The database entry is pulled from
    the container first, then the file is removed from disk.  A failure of
    either step is logged, not raised.

    Requires 'rw' access; aborts with 404 when no attachment has that name.
    """
    fname = self.request.get('name')
    _id = bson.ObjectId(cid)
    container, _ = self._get(_id, 'rw')
    fpath = os.path.join(self.app.config['data_path'], str(_id)[-3:] + '/' + str(_id), fname)
    # A container without an 'attachments' field is treated as having none.
    for a_info in container.get('attachments', []):
        if (a_info['name'] + a_info['ext']) == fname:
            break
    else:
        # Only reached when no attachment matched; relies on self.abort() raising.
        self.abort(404, 'no such file')
    success = self.dbc.update({'_id': _id, 'attachments.name': fname}, {'$pull': {'attachments': {'name': fname}}})
    if not success['updatedExisting']:
        log.info('could not remove database entry.')
    if os.path.exists(fpath):
        os.remove(fpath)
        log.info('removed file %s' % fpath)
    else:
        log.info('could not remove file, file %s does not exist' % fpath)