Source code for hydrobox.io.random

"""
The io.random module holds a number of functions returning random `pandas.Series` or `numpy.ndarray` objects
that can be used for development purposes, or whenever randomized data is needed.
"""
import numpy as np
import pandas as pd
from datetime import datetime

from hydrobox.utils.decorators import accept


def _to_datetime_string(value):
    """Normalize a `start`/`end` argument for ``pandas.date_range``.

    The string ``'now'`` (case-insensitive) is replaced by the current local
    time and any ``datetime`` is formatted as ``YYYYMMDDHHmmss``. Every other
    value (other strings, `None`) is returned unchanged.
    """
    if isinstance(value, str) and value.lower() == 'now':
        value = datetime.now()
    if isinstance(value, datetime):
        # the format has second resolution: microseconds and tzinfo are dropped
        value = value.strftime('%Y%m%d%H%M%S')
    return value


@accept(distribution=str,
        size=(int, 'None'),
        seed=(int, 'None'),
        start=(str, datetime, 'None'),
        end=(str, datetime, 'None'),
        freq=(str, 'None'))
def timeseries_from_distribution(distribution='gamma',
                                 distribution_args=(10, 2), size=10, seed=None,
                                 start='now', end=None, freq='D'):
    """Generate a random time series

    This function will return a ``pandas.Series`` indexed by a
    ``pandas.DatetimeIndex`` holding random data that is generated by the
    given distribution. The distribution name has to be an attribute of
    ``numpy.random`` and the `distribution_args` will be passed as
    `*args`. The `seed` parameter will be directed to ``np.random.seed`` in
    order to return reproducible pseudo-random results.

    Parameters
    ----------
    distribution : string, default='gamma'
        Name of any sampling function from ``numpy.random``. The
        distribution properties (like shape or scale) can be passed with
        the parameter `distribution_args`.
    distribution_args : list, tuple, None, default=(10, 2)
        Will be passed as ``*distribution_args`` into the given sampling
        function. If no arguments shall be passed, distribution_args can
        be set to `None`.
    size : int, default=10
        Specifies the length of the produced time series. If `None`, the
        length is inferred from `start`, `end` and `freq`.
    seed : int, default=None
        If not `None`, it will be passed to ``numpy.random.seed`` before
        sampling. Note that this re-seeds NumPy's module-level random state.
    start : string, datetime, default='now'
        Starting point for the ``pandas.DatetimeIndex``. Can be either a
        ``datetime`` or string. The string has either to be `'now'` for
        using the current time step, or a Datetime string of format
        `YYYYMMDDHHmmss`, where the time (`HHmmss`) can be omitted.
        ``datetime`` objects are truncated to full seconds.
        If `end` is used, `start` or `size` should be set to `None`.
    end : string, datetime, default=None
        see start.
    freq : string, default='D'
        Specify the temporal resolution of the time series. This can either
        be used in case `size` is omitted, but `start` and `end` are given,
        or in case either `start` or `end` is omitted but `size` is given.
        Any string accepted by the `freq` attribute of ``pandas.Grouper`` is
        accepted.

    Returns
    -------
    pandas.Series

    Raises
    ------
    ValueError
        If `distribution` is not the name of a callable in
        ``numpy.random``.

    See Also
    --------
    pandas.Grouper : further information of `freq` settings
    pandas.date_range : builds the index from `start`, `end`, `size`, `freq`

    Notes
    -----
    The time series index is controlled by the `size`, `start`, `end`
    and `freq` parameter, which are forwarded unchanged to
    ``pandas.date_range`` (`size` as `periods`). If `end` is not given, the
    other three parameter have to be set.

    .. warning::
        You cannot set all 4 time series parameter at the same time. In case
        you do not want to use one that holds a default value (like start),
        you will have to set it to `None`.

    """
    # get the function; reject missing names and non-callable attributes alike
    distribution_function = getattr(np.random, distribution, None)
    if not callable(distribution_function):
        raise ValueError('The distribution %s is not known. It has to be importable from numpy.random' % distribution)

    # `None` is documented as "no positional arguments"
    if distribution_args is None:
        distribution_args = ()

    # get start and end
    start = _to_datetime_string(start)
    end = _to_datetime_string(end)

    # build the DatetimeIndex and infer size
    index = pd.date_range(start=start, end=end, periods=size, freq=freq)
    inferred_size = len(index)

    # set the seed if necessary
    if seed is not None:
        np.random.seed(seed=seed)

    # generate the distribution
    values = distribution_function(*distribution_args, size=inferred_size)

    # return the time series
    return pd.Series(index=index, data=values)