Source code for datacatalog.dcat

import typing as T
from enum import Enum

import jsonschema


class Direction(Enum):
    """Direction of a canonicalization pass.

    Passed as the ``direction`` argument of :meth:`Type.canonicalize`.
    """

    #: Default direction used by :meth:`Type.canonicalize`.
    GET = 0
    #: Direction in which system-defined values are replaced by their default.
    PUT = 1
class Type(object):
    """Base class for the schema building blocks in this module.

    Stores the annotations shared by every type and renders them as a
    JSON-schema fragment through :attr:`schema`.
    """

    def __init__(self, *args, title=None, description=None, required=False,
                 default=None, examples=None, format=None, read_only=None,
                 write_only=None, sys_defined=None, **kwargs):
        """Store the generic annotations.

        :param default: the default value, or a callable producing it.
        :param sys_defined: when ``True``, :meth:`canonicalize` replaces the
            value by the default for ``Direction.PUT``.
        :raises ValueError: if any positional or unknown keyword argument
            is left over.
        """
        # Subclasses forward their unconsumed arguments up to here, so
        # anything that arrives was not recognised by any class in the chain.
        if args or kwargs:
            raise ValueError()
        self.title = title
        self.description = description
        self.required = required
        self.default = default
        self.examples = examples
        self.format = format
        self.read_only = read_only
        self.write_only = write_only
        self.sys_defined = sys_defined

    @property
    def schema(self) -> dict:
        """Schema fragment holding every annotation that is not ``None``."""
        annotations = (
            ('title', self.title),
            ('description', self.description),
            ('default', self.default),
            ('sysDefined', self.sys_defined),
            ('examples', self.examples),
            ('format', self.format),
            ('readOnly', self.read_only),
            ('writeOnly', self.write_only),
        )
        fragment = {
            keyword: value
            for keyword, value in annotations
            if value is not None
        }
        # A callable default is a factory: publish its result, not the
        # callable itself.
        if callable(self.default):
            fragment['default'] = self.default()
        return fragment

    def full_text_search_representation(self, data) -> T.Optional[str]:
        """Return the text to index for ``data``; the base type has none."""
        return None

    def validate(self, data):
        """Validate the data.

        :returns: the Type for which validation succeeded. See also
            :meth:`OneOf.validate`
        :rtype: Type
        """
        jsonschema.validate(data, self.schema)
        return self

    # noinspection PyMethodMayBeStatic
    def canonicalize(self, data, direction=Direction.GET):
        """Return ``data`` or, where applicable, the default value.

        The default (called first when it is callable) is returned when
        ``data`` is ``None``, or when the value is system-defined and
        ``direction`` is ``Direction.PUT``.
        """
        # NOTE(review): ``self.required is not None`` holds for any boolean
        # ``required``, so a missing value always falls back to the default.
        # Confirm whether a truthiness test was intended.
        missing = data is None and self.required is not None
        # Currently for sys-defined values the default is used (can be
        # callable).
        forced = self.sys_defined is True and direction is Direction.PUT
        if not (missing or forced):
            return data
        fallback = self.default
        return fallback() if callable(fallback) else fallback
class List(Type):
    """Array type whose items all share a single item type."""

    def __init__(self, item_type: Type, *args, required=False, default=None,
                 allow_empty=True, unique_items=None, **kwargs):
        """
        :param item_type: the type of every item in the list.
        :param allow_empty: when false, the schema requires at least one item.
        :param unique_items: when not ``None``, emitted as ``uniqueItems``.
        """
        # A required list that may be empty defaults to the empty list.
        if default is None and required and allow_empty:
            default = []
        super().__init__(*args, required=required, default=default, **kwargs)
        self.item_type = item_type
        self.allow_empty = allow_empty
        self.unique_items = unique_items

    @property
    def schema(self) -> dict:
        """Array schema wrapping the item type's schema."""
        retval = dict(super().schema)
        retval.update({
            'type': 'array',
            'items': self.item_type.schema
        })
        if self.unique_items is not None:
            retval['uniqueItems'] = bool(self.unique_items)
        if not self.allow_empty:
            retval['minItems'] = 1
        return retval

    def canonicalize(self, data: T.Optional[list], **kwargs) -> T.Optional[list]:
        """Canonicalize every item, dropping items that canonicalize to ``None``.

        :raises TypeError: if ``data`` is neither ``None`` nor a list.
        """
        data = super().canonicalize(data, **kwargs)
        if data is None:
            return None
        if not isinstance(data, list):
            raise TypeError("{}: not a list".format(data))
        canonical_items = (
            self.item_type.canonicalize(datum, **kwargs) for datum in data
        )
        return [item for item in canonical_items if item is not None]

    def full_text_search_representation(self, data: T.Iterable) -> T.Optional[str]:
        """Join the representations of all items with blank lines.

        We must check whether the given data is really a list, jsonld may
        flatten lists.

        :returns: the joined text, or ``None`` when there is nothing to index.
        """
        if not isinstance(data, list):
            return self.item_type.full_text_search_representation(data)
        representations = (
            self.item_type.full_text_search_representation(item)
            for item in data
            if item is not None
        )
        # An item type may have no representation for a value (``None``);
        # ``str.join`` would raise ``TypeError`` on those, so drop them.
        retval = '\n\n'.join(
            text for text in representations if text is not None
        )
        return retval if retval else None
class OneOf(Type):
    """Union type: the data must match one of several alternative types."""

    def __init__(self, *types, **kwargs):
        """
        :param types: the alternative :class:`Type` instances.
        """
        super().__init__(**kwargs)
        self.types = list(types)

    @property
    def schema(self) -> dict:
        """``oneOf`` schema listing the schema of every alternative."""
        retval = dict(super().schema)
        retval['oneOf'] = [v.schema for v in self.types]
        return retval

    def validate(self, data) -> Type:
        """Validate the data and find the alternative it conforms to.

        :returns: the first alternative whose own schema accepts ``data``.
        :raises jsonschema.ValidationError: if ``data`` does not satisfy the
            ``oneOf`` schema.
        """
        # Check the complete ``oneOf`` schema first so that the acceptance
        # criteria stay what they were.
        try:
            jsonschema.validate(data, self.schema)
        except jsonschema.ValidationError as error:
            raise jsonschema.ValidationError("Not valid for any type") from error
        # Then test each alternative against its OWN schema; testing against
        # ``self.schema`` would select the first alternative unconditionally.
        for candidate in self.types:
            try:
                jsonschema.validate(data, candidate.schema)
            except jsonschema.ValidationError:
                continue
            return candidate
        raise jsonschema.ValidationError("Not valid for any type")

    def full_text_search_representation(self, data: T.Any):
        """Not supported for union types."""
        raise NotImplementedError()

    def canonicalize(self, data: T.Any, **kwargs):
        """Canonicalize ``data`` with the alternative that validates it."""
        return self.validate(data).canonicalize(data, **kwargs)
class Object(Type):
    """Object type with an ordered collection of named, typed properties."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Kept as a list of (name, type) pairs because insertion order is
        # meaningful: it is published as ``x-order`` in the schema.
        self.properties: T.List[T.Tuple[str, Type]] = []

    @property
    def property_names(self):
        """Names of all properties, in order."""
        return [name for name, _ in self.properties]

    def __getitem__(self, item):
        """Return the type of the property called ``item``.

        :raises KeyError: if there is no such property.
        """
        for name, value in self.properties:
            if name == item:
                return value
        raise KeyError(item)

    def add(self, name, value, before=None):
        """Add a property, at the end or in front of an existing one.

        :param name: name of the new property.
        :param value: the :class:`Type` of the new property.
        :param before: name of the existing property to insert in front of;
            ``None`` appends.
        :returns: ``self``, so that calls can be chained.
        :raises ValueError: if ``name`` is already defined, or ``before``
            names an unknown property.
        """
        names = self.property_names
        if name in names:
            raise ValueError("{}: property already defined".format(name))
        entry = (name, value)
        if before is None:
            self.properties.append(entry)
            return self
        try:
            insert_position = names.index(before)
        except ValueError:
            raise ValueError("{}: no such property".format(before)) from None
        self.properties.insert(insert_position, entry)
        return self

    @property
    def schema(self) -> dict:
        """Object schema with ``properties``, ``x-order`` and ``required``."""
        retval = dict(super().schema)
        retval.update({
            'type': 'object',
            'properties': {
                name: value.schema for name, value in self.properties
            },
            'x-order': [name for name, _ in self.properties]
        })
        required = [name for name, value in self.properties if value.required]
        if required:
            retval['required'] = required
        return retval

    def full_text_search_representation(self, data: dict):
        """Join the representations of all present properties.

        :returns: the joined text, or ``None`` when there is nothing to index.
        """
        ftsr = (
            value.full_text_search_representation(data[key])
            for key, value in self.properties
            if key in data
        )
        retval = '\n\n'.join(v for v in ftsr if v is not None)
        return retval if retval else None

    def canonicalize(self, data: dict, **kwargs):
        """Canonicalize every known property; unknown keys are dropped.

        :raises TypeError: if ``data`` is neither ``None`` nor a dict.
        """
        data = super().canonicalize(data, **kwargs)
        if data is None:
            return None
        if not isinstance(data, dict):
            raise TypeError("{}: not a dict".format(data))
        retval = {}
        for key, type_ in self.properties:
            if type_.sys_defined is True:
                # System-defined properties are canonicalized even when
                # absent, so that their default can be filled in.
                canonical_value = type_.canonicalize(data.get(key), **kwargs)
            elif key in data:
                canonical_value = type_.canonicalize(data[key], **kwargs)
            else:
                continue
            if canonical_value is not None:
                retval[key] = canonical_value
        return retval
class String(Type):
    """String type with optional pattern and length constraints."""

    def __init__(self, *args, pattern=None, max_length=None,
                 allow_empty=False, **kwargs):
        """
        :param pattern: regular expression emitted as ``pattern``.
        :param max_length: upper bound emitted as ``maxLength``.
        :param allow_empty: when false, the schema demands ``minLength`` 1
            and :meth:`canonicalize` turns empty strings into ``None``.
        """
        super().__init__(*args, **kwargs)
        self.pattern = pattern
        self.max_length = max_length
        self.allow_empty = allow_empty

    @property
    def schema(self) -> dict:
        """String schema including the configured constraints."""
        constraints = {'type': 'string'}
        if self.pattern is not None:
            constraints['pattern'] = self.pattern
        if self.max_length is not None:
            constraints['maxLength'] = self.max_length
        if not self.allow_empty:
            constraints['minLength'] = 1
        return {**super().schema, **constraints}

    def full_text_search_representation(self, data: str):
        """A string is indexed as is."""
        return data

    def canonicalize(self, data: str, **kwargs):
        """Strip surrounding whitespace and convert CRLF to LF.

        :returns: the cleaned string, or ``None`` when the result is empty
            and empty strings are not allowed.
        :raises TypeError: if the value is neither ``None`` nor a string.
        """
        text = super().canonicalize(data, **kwargs)
        if text is None:
            return None
        if not isinstance(text, str):
            raise TypeError("{}: not a string".format(text))
        cleaned = text.strip().replace('\r\n', '\n')
        if cleaned or self.allow_empty:
            return cleaned
        return None
class PlainTextLine(String):
    """String with a fixed pattern: no CR/LF, at least one non-space character."""

    def __init__(self, *args, pattern=None, **kwargs):
        # The pattern is fixed by this class; callers must not supply one.
        assert pattern is None
        single_line = r'^[^\n\r]*?\S[^\n\r]*$'
        super().__init__(*args, pattern=single_line, **kwargs)
class Date(String):
    """Date string; accepts a date or date-time, canonicalizes to the date."""

    def __init__(self, *args, format=None, pattern=None, **kwargs):
        # Format and pattern are fixed by this class; callers must not
        # supply their own.
        assert format is None and pattern is None
        # YYYY-MM-DD, optionally followed by THH:MM:SS[.fraction] and/or a
        # time zone: "Z" or a numeric offset. The sign of the offset was
        # missing from the pattern, so "+01:00" was rejected; it is optional
        # here so that every previously accepted value still matches.
        super().__init__(
            *args,
            format='date',
            pattern=(
                r'^\d\d\d\d-[01]\d-[0-3]\d'
                r'(?:T[012]\d:[0-5]\d:[0-5]\d(?:\.\d+)?)?'
                r'(?:Z|[+-]?[01]\d(?::[0-5]\d)?)?$'
            ),
            **kwargs
        )

    def canonicalize(self, data: str, **kwargs) -> T.Optional[str]:
        """Reduce the value to its first ten characters (``YYYY-MM-DD``).

        :raises TypeError: if the value is neither ``None`` nor a string.
        """
        data = super().canonicalize(data, **kwargs)
        if data is None:
            return None
        if not isinstance(data, str):
            raise TypeError("{}: not a string".format(data))
        # Drop any time and time-zone part.
        return data[:10]
class Language(String):
    """String of the form ``lang1:xx`` or ``lang2:xxx`` with format ``lang``."""

    def __init__(self, *args, format=None, pattern=None, **kwargs):
        # Format and pattern are fixed by this class; callers must not
        # supply their own.
        assert format is None and pattern is None
        language_code = r'^(?:lang1:\w\w|lang2:\w\w\w)$'
        super().__init__(
            *args, format='lang', pattern=language_code, **kwargs
        )
# NOTE: from here on this class shadows ``enum.Enum`` imported at the top of
# the module.
class Enum(String):
    """String limited to a fixed set of values, each with a display name."""

    def __init__(self, values, *args, allow_empty=None, **kwargs):
        """
        :param values: sequence of ``(value, display name)`` pairs.
        """
        # ``allow_empty`` is not configurable for enumerations.
        assert allow_empty is None
        super().__init__(*args, **kwargs)
        self.values = values
        # Lookup table from value to display name.
        names_by_value = {}
        for code, label in values:
            names_by_value[code] = label
        self.dict = names_by_value

    @property
    def schema(self) -> dict:
        """String schema extended with ``enum`` and ``enumNames``."""
        codes = [entry[0] for entry in self.values]
        labels = [entry[1] for entry in self.values]
        return {**super().schema, 'enum': codes, 'enumNames': labels}

    def full_text_search_representation(self, data: str):
        """Index the display name of the value.

        Raises ``KeyError`` for a value that is not part of the enumeration.
        """
        return self.dict[data]
class Integer(Type):
    """Integer type with optional range and divisibility constraints."""

    def __init__(self, *args, multipleOf=None, maximum=None,
                 exclusiveMaximum=None, minimum=None, exclusiveMinimum=None,
                 **kwargs):
        """Each constraint is emitted in the schema under its own name."""
        super().__init__(*args, **kwargs)
        self.multipleOf = multipleOf
        self.maximum = maximum
        self.exclusiveMaximum = exclusiveMaximum
        self.minimum = minimum
        self.exclusiveMinimum = exclusiveMinimum

    @property
    def schema(self) -> dict:
        """Numeric schema including the configured constraints."""
        retval = dict(super().schema)
        # NOTE(review): emitted as 'number' rather than 'integer'; left as is
        # because schema consumers may rely on it. Confirm which is intended.
        retval['type'] = 'number'
        # A tuple rather than a set, so that the order of the keys in the
        # schema is the same on every run.
        keywords = ('multipleOf', 'maximum', 'exclusiveMaximum', 'minimum',
                    'exclusiveMinimum')
        for keyword in keywords:
            bound = getattr(self, keyword)
            if bound is not None:
                assert isinstance(bound, int)
                retval[keyword] = bound
        return retval

    def full_text_search_representation(self, data: T.Any):
        """Index integers by their decimal text; other values are skipped."""
        return str(data) if isinstance(data, int) else None

    def canonicalize(self, data, **kwargs):
        """Convert the value to an ``int``.

        Strings are accepted when, apart from surrounding whitespace, they
        have the same length as the canonical decimal form of their value.

        :raises ValueError: if a string is not such an integer.
        :raises TypeError: if the value is neither ``None``, an ``int`` nor
            a string.
        """
        data = super().canonicalize(data, **kwargs)
        if data is None:
            return None
        if isinstance(data, int):
            return data
        if isinstance(data, str):
            text = data.strip()
            retval = int(text)
            # Compare against the STRIPPED text. The unstripped input made
            # the strip pointless: padded integers were always rejected.
            # The length check refuses forms like "+5" or "007".
            if len(str(retval)) != len(text):
                raise ValueError("{}: not an integer".format(data))
            return retval
        raise TypeError("{}: not an integer".format(data))
# Schema definitions. ``Object.add`` returns the object itself, so each
# schema is built as a single chain of calls.

DISTRIBUTION = (
    Object()
    .add('dct:title', String())
    .add('dct:description', String())
    .add('dct:issued', Date())
    .add('dct:modified', Date())
    .add('dc:identifier', PlainTextLine())
    .add('dct:license', String())
    .add('dct:rights', String())
    .add('dcat:accessURL', String(format='uri'))
    .add('dcat:downloadURL', String(format='uri'))
    .add('dcat:mediaType', String(pattern=r'^[-\w.]+/[-\w.]+$'))
    .add('dct:format', String())
    .add('dcat:byteSize', Integer(minimum=0))
)

VCARD = Object().add('vcard:fn', PlainTextLine(required=True))

FOAF_AGENT = Object().add('foaf:name', PlainTextLine(required=True))

DATASET = (
    Object()
    .add('dct:title', String())
    .add('dct:description', String())
    .add('dct:issued', Date())
    .add('dct:modified', Date())
    .add('dct:identifier', PlainTextLine())
    .add('dcat:keyword', List(PlainTextLine()))
    .add('dct:language', Language())
    .add('dcat:contactPoint', VCARD)
    .add('dct:Temporal', String())
    .add('dct:Spatial', String())
    .add('dct:accrualPeriodicity', String())
    .add('dcat:landingPage', String(format='uri'))
    .add('dcat:theme', String(format='uri'))
    .add('dct:publisher', FOAF_AGENT)
    .add('dcat:distribution', DISTRIBUTION)
)

# Debugging aid: print the generated JSON schema.
# import json
# print(json.dumps(
#     DATASET.schema,
#     indent=' ', sort_keys=True
# ))