Source code for kwutil.util_json

"""
json utilities for debugging serializability and attempting to ensure it in
some cases.
"""
import os
import copy
import decimal
import fractions
import pathlib
import ubelt as ub
from collections import OrderedDict

try:
    import numpy as np
except ImportError:
    np = None



[docs]
def debug_json_unserializable(data, msg=''):
    """
    Fail loudly when ``data`` contains parts that cannot be json encoded.

    Collects every item reported by :func:`find_json_unserializable` and, if
    there is at least one, raises an exception whose message is ``msg``
    followed by a ``ub.urepr`` rendering of the collected items.

    Args:
        data (object): data that is expected to be json serializable
        msg (str): text placed in front of the report in the error message

    Raises:
        Exception: if any unserializable part is found
    """
    bad_parts = [part for part in find_json_unserializable(data)]
    if not bad_parts:
        return
    report = ub.urepr(bad_parts)
    raise Exception(msg + report)




[docs]
def ensure_json_serializable(dict_, normalize_containers=False, verbose=0,
                             unhandled_policy='keep'):
    """
    Attempt to convert common types (e.g. numpy) into something json compliant

    The input is deep-copied and the copy is normalized, so the caller's data
    is never modified. The following conversions are applied to every value
    reachable through lists, tuples, sets and dict *values* (dict keys are
    left untouched):

        * tuple, set, frozenset -> list (their elements are normalized too)
        * numpy.ndarray -> nested lists via ``tolist()``
        * numpy bool / integer / floating scalars -> bool / int / float
        * numpy complex scalars -> complex (which json itself cannot encode,
          so it is subject to ``unhandled_policy``)
        * bytes -> str if it can be decoded, otherwise it is left as bytes
          and is subject to ``unhandled_policy``
        * decimal.Decimal, fractions.Fraction -> float
        * pathlib paths -> str
        * objects with a ``__json__`` method -> the result of calling it
          (list / dict results are normalized recursively)

    Args:
        dict_ (List | Dict):
            A data structure nearly compatible with json. (todo: rename arg)
            Any other top-level value (e.g. a tuple, set or scalar) is
            treated exactly like a nested value.

        normalize_containers (bool):
            if True, normalizes dict containers to be standard python
            structures. Subclasses of OrderedDict become OrderedDict and all
            other dict subclasses become dict. Defaults to False.

        verbose (int):
            currently unused; accepted for backwards compatibility.

        unhandled_policy (str):
            What to do if there isn't a straightforward way to convert to
            a serializable structure. Can be "keep", "error" or "stringify".

    Returns:
        Dict | List:
            normalized data structure that should be entirely json
            serializable.

    Raises:
        Exception: if ``unhandled_policy='error'`` and a value cannot be
            converted.
        KeyError: if an unconvertible value is found and ``unhandled_policy``
            is not one of the known policies.

    Note:
        This was ported from kwcoco.util

    Example:
        >>> from kwutil.util_json import *  # NOQA
        >>> assert ensure_json_serializable([]) == []
        >>> assert ensure_json_serializable({}) == {}
        >>> data = [pathlib.Path('.')]
        >>> assert ensure_json_serializable(data) == ['.']
        >>> assert ensure_json_serializable(data) != data
        >>> # items inside of sets are normalized as well
        >>> assert ensure_json_serializable([{pathlib.Path('.')}]) == [['.']]

    Example:
        >>> # by default non-serializable objects are kept-as-is
        >>> data = [[], {}, object(), (1, 2)]
        >>> ensure_json_serializable(data)
        >>> ensure_json_serializable(data, unhandled_policy='stringify')
        >>> #ensure_json_serializable(data, unhandled_policy='pickle')
        >>> import pytest
        >>> with pytest.raises(Exception):
        >>>     ensure_json_serializable(data, unhandled_policy='error')

    Example:
        >>> # xdoctest: +REQUIRES(module:numpy)
        >>> from kwutil.util_json import *  # NOQA
        >>> data = ub.ddict(lambda: int)
        >>> data['foo'] = ub.ddict(lambda: int)
        >>> data['bar'] = np.array([1, 2, 3])
        >>> data['foo']['a'] = 1
        >>> data['foo']['b'] = (1, np.array([1, 2, 3]), {3: np.int32(3), 4: np.float16(1.0)})
        >>> dict_ = data
        >>> print(ub.urepr(data, nl=-1))
        >>> assert list(find_json_unserializable(data))
        >>> result = ensure_json_serializable(data, normalize_containers=True)
        >>> print(ub.urepr(result, nl=-1))
        >>> assert not list(find_json_unserializable(result))
        >>> assert type(result) is dict
    """
    scalar_types = (int, float, str, type(None))
    container_types = (tuple, list, dict)
    serializable_types = scalar_types + container_types

    def _norm_container(c):
        # Cast dict subclasses to a builtin mapping type. Non-dict values
        # are returned unchanged.
        if isinstance(c, dict):
            if isinstance(c, OrderedDict):
                if type(c) is not OrderedDict:
                    c = OrderedDict(c)
            elif type(c) is not dict:
                c = dict(c)
        return c

    def _convert(value):
        # Return a json-friendlier replacement for a single value, or the
        # value itself when no conversion is known. Never recurses; the
        # main loop below is responsible for descending into containers.
        if isinstance(value, (tuple, set, frozenset)):
            return list(value)
        if np is not None:
            if isinstance(value, np.ndarray):
                return value.tolist()
            if isinstance(value, np.bool_):
                return bool(value)
            if isinstance(value, np.integer):
                return int(value)
            if isinstance(value, np.floating):
                return float(value)
            if isinstance(value, np.complexfloating):
                return complex(value)
        if isinstance(value, bytes):
            try:
                return value.decode()
            except UnicodeDecodeError:
                # Best effort only: leave undecodable bytes for the
                # unhandled_policy to deal with.
                return value
        if isinstance(value, (decimal.Decimal, fractions.Fraction)):
            return float(value)
        if isinstance(value, pathlib.PurePath):
            return str(value)
        if hasattr(value, '__json__'):
            return value.__json__()
        return value

    # Wrapping the copy in a one-item list lets the top-level value be
    # converted / replaced with the same code path as any nested value.
    holder = [copy.deepcopy(dict_)]
    pending = [holder]
    while pending:
        container = pending.pop()
        if isinstance(container, dict):
            # Snapshot the items because values are reassigned while looping
            items = list(container.items())
        else:
            items = enumerate(container)

        for key, value in items:
            new_value = _convert(value)

            if normalize_containers:
                new_value = _norm_container(new_value)

            if unhandled_policy != 'keep':
                # The policy is checked against the *converted* value so that
                # failed conversions (e.g. undecodable bytes) are not missed.
                if not isinstance(new_value, serializable_types):
                    if unhandled_policy == 'error':
                        raise Exception(f'Unserializable: value={new_value!r}')
                    elif unhandled_policy == 'stringify':
                        new_value = f'UNSERIALIZABLE: {new_value!r}'
                    else:
                        raise KeyError(unhandled_policy)

            if new_value is not value:
                container[key] = new_value

            # Tuples and sets were turned into lists above, so lists and
            # dicts are the only containers left to descend into.
            if isinstance(new_value, (list, dict)):
                pending.append(new_value)

    return holder[0]




[docs]
def find_json_unserializable(data, quickcheck=False):
    """
    Recurse through json datastructure and find any component that
    causes a serialization error. Record the location of these errors
    in the datastructure as we recurse through the call tree.

    Lists, tuples and dicts are traversed. Ints, floats, strings and None are
    considered serializable, both as values and as dictionary keys.

    Args:
        data (object): data that should be json serializable
        quickcheck (bool): if True, check the entire datastructure assuming
            its ok before doing the python-based recursive logic.

    Yields:
        Dict: a "bad part" dictionary for each problem found, containing:

            'data' - the value (or dictionary key) that caused the
            serialization error

            'loc' - which contains a list of key/indexes that can be used
            to lookup the location of the unserializable value.
            If the last item of "loc" is itself a list, then it indicates a
            rare case where a key in a dictionary is causing the
            serialization error. If ``data`` itself (the root) is the
            problem, "loc" is an empty list.

    Raises:
        ValueError: if a container is nested (directly or indirectly) inside
            of itself. Containers that are merely shared between different
            branches are fine.

    Note:
        This was ported from kwcoco.util

    Example:
        >>> # xdoctest: +REQUIRES(module:numpy)
        >>> from kwutil.util_json import *  # NOQA
        >>> part = ub.ddict(lambda: int)
        >>> part['foo'] = ub.ddict(lambda: int)
        >>> part['bar'] = np.array([1, 2, 3])
        >>> part['foo']['a'] = 1
        >>> # Create a dictionary with two unserializable parts
        >>> data = [1, 2, {'nest1': [2, part]}, {frozenset({'badkey'}): 3, 2: 4}]
        >>> parts = list(find_json_unserializable(data))
        >>> print('parts = {}'.format(ub.urepr(parts, nl=1)))
        >>> # Check expected structure of bad parts
        >>> assert len(parts) == 2
        >>> part = parts[1]
        >>> assert list(part['loc']) == [2, 'nest1', 1, 'bar']
        >>> # We can use the "loc" to find the bad value
        >>> for part in parts:
        >>>     # "loc" is a list of directions containing which keys/indexes
        >>>     # to traverse at each descent into the data structure.
        >>>     directions = part['loc']
        >>>     curr = data
        >>>     special_flag = False
        >>>     for key in directions:
        >>>         if isinstance(key, list):
        >>>             # special case for bad keys
        >>>             special_flag = True
        >>>             break
        >>>         else:
        >>>             # normal case for bad values
        >>>             curr = curr[key]
        >>>     if special_flag:
        >>>         assert part['data'] in curr.keys()
        >>>         assert part['data'] is key[1]
        >>>     else:
        >>>         assert part['data'] is curr

    Example:
        >>> from kwutil.util_json import *  # NOQA
        >>> import pytest
        >>> # Test circular reference
        >>> data = [[], {'a': []}]
        >>> data[1]['a'].append(data)
        >>> with pytest.raises(ValueError, match="Circular reference detected at.*1, 'a', 1*"):
        ...     parts = list(find_json_unserializable(data))
        >>> # Should be ok here
        >>> shared_data = {'shared': 1}
        >>> data = [[shared_data], shared_data]
        >>> parts = list(find_json_unserializable(data))
    """
    import json

    if quickcheck:
        try:
            # Might be a more efficient way to do this check. We duplicate a
            # lot of work by doing the check for unserializable data this way.
            json.dumps(data)
        except Exception:
            # Something is wrong; fall through to find out where it is.
            pass
        else:
            # Everything encoded fine, there is nothing to report.
            return

    scalar_types = (int, float, str, type(None))
    container_types = (tuple, list, dict)
    serializable_types = scalar_types + container_types

    if not isinstance(data, container_types):
        # The root is not something we can descend into, judge it directly.
        if not isinstance(data, serializable_types):
            yield {'loc': [], 'data': data}
        return

    # Each stack entry carries the ids of the containers on the path from
    # the root to (and including) that container. A container is circular
    # only if it shows up among its own ancestors; it may appear any number
    # of times in unrelated branches (the pointer graph must be a DAG).
    stack = [(data, [], frozenset([id(data)]))]
    while stack:
        container, prefix, ancestors = stack.pop()
        if isinstance(container, dict):
            items = container.items()
        else:
            items = enumerate(container)

        for key, value in items:
            loc = prefix + [key]

            if isinstance(value, container_types):
                if id(value) in ancestors:
                    circ_loc = {'loc': loc, 'data': value}
                    raise ValueError(f'Circular reference detected at {circ_loc}')
                stack.append((value, loc, ancestors | {id(value)}))

            if not isinstance(key, scalar_types):
                # Special case where a dict key is the error value
                # Purposely make loc non-hashable so its not confused with
                # an address. All we can know in this case is that they key
                # is at this level, there is no concept of where.
                yield {'loc': prefix + [['.keys', key]], 'data': key}
            elif not isinstance(value, serializable_types):
                yield {'loc': loc, 'data': value}




[docs]
class Json:
    """
    Similar to kwutil.Yaml, the Json class provides a set of helpers to make
    working with json easier.

    The reading / writing helpers (:func:`Json.load`, :func:`Json.loads`,
    :func:`Json.dump`, :func:`Json.dumps` and :func:`Json.coerce`) accept a
    ``backend`` argument that is one of "stdlib", "ujson" or "orjson". The
    optional backends are only imported when they are requested.

    Example:
        >>> from kwutil.util_json import Json
        >>> import ubelt as ub
        >>> unserializable_data = {
        >>>     'a': 'hello world',
        >>>     'b': ub.udict({'a': 3}),
        >>>     'c': ub.Path('a/path/object'),
        >>> }
        >>> data = Json.ensure_serializable(unserializable_data)
        >>> text1 = Json.dumps(data, backend='stdlib')
        >>> # Coerce is idempotent and resolves the input to nested Python
        >>> # structures.
        >>> resolved1 = Json.coerce(data)
        >>> resolved2 = Json.coerce(text1)
        >>> resolved3 = Json.coerce(resolved2)
        >>> assert resolved1 == resolved2 == resolved3 == data
        >>> # with stdlib
        >>> data2 = Json.loads(text1)
        >>> assert data2 == data
        >>> # with ujson
        >>> # xdoctest: +REQUIRES(module:ujson)
        >>> data2 = Json.loads(text1, backend='ujson')
        >>> assert data2 == data
        >>> # with orjson
        >>> # xdoctest: +REQUIRES(module:orjson)
        >>> data2 = Json.loads(text1, backend='orjson')
        >>> assert data2 == data
    """

    @staticmethod
    def _load_filepointer(filepointer, backend='stdlib'):
        """
        Parse json from an already opened file object.

        Args:
            filepointer (IO): an open, readable file object
            backend (str): stdlib, ujson, or orjson

        Returns:
            object: the decoded data

        Raises:
            NotImplementedError: if the backend is unknown
        """
        if backend == 'stdlib':
            import json
            data = json.load(filepointer)
        elif backend == 'ujson':
            import ujson
            data = ujson.load(filepointer)
        elif backend == 'orjson':
            import orjson
            # orjson has no file based api, so the content is read up front
            data = orjson.loads(filepointer.read())
        else:
            raise NotImplementedError(backend)
        return data

    @staticmethod
    def load(file, backend='stdlib'):
        """
        Load json from a filepointer or filepath.

        Args:
            file (Path | str | _io._IOBase):
                a path to a file, or an open file descriptor in bytes or str
                mode. bytes mode is more efficient.

            backend (str): stdlib, ujson, or orjson

        Returns:
            object: the decoded data

        Example:
            >>> import kwutil
            >>> import io
            >>> # test loading from string or byte file pointers
            >>> data = b'["hello", {"from": "json"}]'
            >>> r1 = kwutil.Json.load(io.BytesIO(data), backend='stdlib')
            >>> r2 = kwutil.Json.load(io.StringIO(data.decode()), backend='stdlib')
            >>> # xdoctest: +REQUIRES(module:ujson)
            >>> r3 = kwutil.Json.load(io.BytesIO(data), backend='ujson')
            >>> r4 = kwutil.Json.load(io.StringIO(data.decode()), backend='ujson')
            >>> # xdoctest: +REQUIRES(module:orjson)
            >>> r3 = kwutil.Json.load(io.BytesIO(data), backend='orjson')
            >>> r4 = kwutil.Json.load(io.StringIO(data.decode()), backend='orjson')
            >>> assert r1 == r2 == r3 == r4
        """
        if isinstance(file, (str, os.PathLike)):
            fpath = file
            with open(fpath, 'rb') as fp:
                return Json._load_filepointer(fp, backend=backend)
        else:
            return Json._load_filepointer(file, backend=backend)

    @staticmethod
    def loads(text, backend='stdlib'):
        """
        Decode json from bytes or text

        Args:
            text (str | bytes): the encoded json document
            backend (str): stdlib, ujson, or orjson

        Returns:
            object: the decoded data

        Raises:
            NotImplementedError: if the backend is unknown
        """
        if backend == 'stdlib':
            import json
            data = json.loads(text)
        elif backend == 'ujson':
            import ujson
            data = ujson.loads(text)
        elif backend == 'orjson':
            import orjson
            data = orjson.loads(text)
        else:
            raise NotImplementedError(backend)
        return data

    @staticmethod
    def dump(data, fp, backend='stdlib', **kwargs):
        """
        Write json data to a file with a chosen backend.

        Args:
            data (dict | list | int | float | str): json serializable data.
            fp (PathLike | IO): Where to write the data. If this is a path
                the file is opened (and truncated) for writing and closed
                afterwards.
            backend (str): stdlib, ujson, or orjson
            **kwargs : additional arguments to pass to the specific backend.

        Raises:
            NotImplementedError: if the backend is unknown. When ``fp`` is a
                path this is checked before the file is touched.

        Example:
            >>> from kwutil.util_json import Json
            >>> import io
            >>> file = io.StringIO()
            >>> Json.dump({'a': [1, 2]}, file)
            >>> print(file.getvalue())
            {"a": [1, 2]}
        """
        if isinstance(fp, (str, os.PathLike)):
            if backend not in {'stdlib', 'ujson', 'orjson'}:
                raise NotImplementedError(backend)
            if backend == 'orjson':
                # orjson produces bytes, so the file must be binary
                open_kwargs = {'mode': 'wb'}
            else:
                open_kwargs = {'mode': 'w', 'encoding': 'utf-8'}
            with open(fp, **open_kwargs) as file:
                return Json.dump(data, file, backend=backend, **kwargs)

        if backend == 'stdlib':
            import json
            json.dump(data, fp, **kwargs)
        elif backend == 'ujson':
            import ujson
            ujson.dump(data, fp, **kwargs)
        elif backend == 'orjson':
            import io
            import orjson
            payload = orjson.dumps(data, **kwargs)
            if isinstance(fp, io.TextIOBase):
                # Text streams only accept str, orjson emits utf-8 bytes
                payload = payload.decode('utf-8')
            fp.write(payload)
        else:
            raise NotImplementedError(backend)

    @staticmethod
    def dumps(data, backend='stdlib', **kwargs):
        """
        Convert json data to text with a chosen backend.

        Args:
            data (dict | list | int | float | str): json serializable data.
            backend (str): stdlib, ujson, or orjson
            **kwargs : additional arguments to pass to the specific backend.

        Returns:
            str | bytes: whatever the ``dumps`` function of the backend
                returns. Note that orjson encodes to bytes rather than str.

        Raises:
            NotImplementedError: if the backend is unknown
        """
        if backend == 'stdlib':
            import json
            text = json.dumps(data, **kwargs)
        elif backend == 'ujson':
            import ujson
            text = ujson.dumps(data, **kwargs)
        elif backend == 'orjson':
            import orjson
            text = orjson.dumps(data, **kwargs)
        else:
            raise NotImplementedError(backend)
        return text

    @classmethod
    def coerce(cls, data, backend='stdlib', path_policy='existing_file_with_extension'):
        """
        Resolve the input into parsed json data.

        Args:
            data (str | PathLike | IO | object):
                A path-like object or an object with a ``read`` method is
                loaded as a file. A string is decoded as json text, unless
                the ``path_policy`` decides it refers to a file, in which
                case that file is loaded. Anything else is assumed to be
                parsed already and is returned as-is.

            backend (str): stdlib, ujson, or orjson

            path_policy (str):
                Controls when a string is interpreted as a path. Only
                non-blank strings without a newline are ever considered.
                Can be:
                "never" - strings are always json text.
                "existing_file" - strings that are the path of an existing
                file are loaded from disk.
                "existing_file_with_extension" - like "existing_file", but
                the file name must also contain a ".".

        Returns:
            object: the parsed data

        Raises:
            KeyError: if ``data`` is a string and ``path_policy`` is unknown

        Example:
            >>> from kwutil.util_json import Json
            >>> import ubelt as ub
            >>> Json.coerce('[1, 2, 3]')
            [1, 2, 3]
            >>> fpath = ub.Path.appdir('kwutil/tests/util_json').ensuredir() / 'file.json'
            >>> fpath.write_text(Json.dumps([4, 5, 6]))
            >>> Json.coerce(fpath)
            [4, 5, 6]
            >>> Json.coerce(str(fpath))
            [4, 5, 6]
            >>> dict(Json.coerce('{"a": "b", "c": "d"}'))
            {'a': 'b', 'c': 'd'}
            >>> print(Json.coerce(None))
            None
        """
        if isinstance(data, os.PathLike):
            result = Json.load(data, backend=backend)
        elif isinstance(data, str):
            maybe_path = None

            if path_policy != 'never':
                if path_policy == 'existing_file':
                    path_requires_extension = False
                elif path_policy == 'existing_file_with_extension':
                    path_requires_extension = True
                else:
                    raise KeyError(path_policy)

                if '\n' not in data and data.strip():
                    # Ambiguous case: might this be path-like?
                    candidate = pathlib.Path(data)
                    try:
                        if candidate.is_file():
                            maybe_path = candidate
                    except OSError:
                        # e.g. the text is too long to be a file name
                        maybe_path = None

                if maybe_path is not None and path_requires_extension:
                    # If the input looks like a path, try to load it.  This was
                    # added because I tried to coerce "auto" as a string, but
                    # for some reason there was a file "auto" in my cwd and
                    # that was confusing.
                    if '.' not in maybe_path.name:
                        maybe_path = None

            if maybe_path is not None:
                result = Json.load(maybe_path, backend=backend)
            else:
                result = Json.loads(data, backend=backend)
        elif hasattr(data, 'read'):
            # assume file
            result = Json.load(data, backend=backend)
        else:
            # Probably already parsed. Return the input
            result = data
        return result

    @classmethod
    def find_unserializable(cls, data, quickcheck=False):
        """
        Find the parts of ``data`` that cannot be json encoded.

        This forwards to :func:`find_json_unserializable`; see that function
        for details on the arguments and on the items that are produced.

        Example:
            >>> import kwutil
            >>> import ubelt as ub
            >>> data = {
            >>>     'a': 1,
            >>>     'b': 2,
            >>>     'c': ub.Path('/pathlib/object')
            >>> }
            >>> results = list(kwutil.Json.find_unserializable(data))
            >>> print(f'results = {ub.urepr(results, nl=1)}')
            results = [
                {'loc': ['c'], 'data': Path('/pathlib/object')},
            ]
        """
        return find_json_unserializable(data, quickcheck)

    @classmethod
    def ensure_serializable(cls, dict_, normalize_containers=False, verbose=0,
                            unhandled_policy='keep'):
        """
        Convert common non-json types in ``dict_`` into json compliant ones.

        This forwards to :func:`ensure_json_serializable`; see that function
        for details on the arguments and the return value.

        Example:
            >>> import kwutil
            >>> import pathlib
            >>> import ubelt as ub
            >>> data = {
            >>>     'a': 1,
            >>>     'b': 2,
            >>>     'c': pathlib.Path('/pathlib/object')
            >>> }
            >>> results = kwutil.Json.ensure_serializable(data)
            >>> print(f'results = {ub.urepr(results, nl=1)}')
            results = {
                'a': 1,
                'b': 2,
                'c': '/pathlib/object',
            }
        """
        return ensure_json_serializable(
            dict_, normalize_containers=normalize_containers, verbose=verbose,
            unhandled_policy=unhandled_policy)

    @classmethod
    def debug_unserializable(cls, data, msg=''):
        """
        Raises an exception if the data is not serializable and prints information
        about it. This is a thin wrapper around :func:`Json.find_unserializable`.

        Args:
            data (object): data that should be json serializable
            msg (str): text placed in front of the report in the error message

        Raises:
            Exception: if any unserializable part is found

        Example:
            >>> import kwutil
            >>> import ubelt as ub
            >>> data = {
            >>>     'a': 1,
            >>>     'b': 2,
            >>>     'c': ub.Path('/pathlib/object')
            >>> }
            >>> try:
            >>>     kwutil.Json.debug_unserializable(data, 'obj had non-json data at: ')
            >>> except Exception as ex:
            >>>     print(f'Exception: {ex}')
            Exception: obj had non-json data at: [
                {'loc': ['c'], 'data': Path('/pathlib/object')},
            ]
        """
        unserializable = list(cls.find_unserializable(data))
        if unserializable:
            raise Exception(msg + ub.urepr(unserializable))