Source code for hydrobox.io.random

"""
The io.random module holds a number of functions returning random `pandas.Series` or `numpy.ndarray` objects
that can be used for development purposes, or whenever randomized data is needed.
"""
import numpy as np
import pandas as pd
from datetime import datetime

from hydrobox.utils.decorators import accept


@accept(distribution=str, size=(int, 'None'), seed=(int, 'None'),
        start=(str, datetime, 'None'), end=(str, datetime, 'None'),
        freq=(str, 'None'))
def timeseries_from_distribution(distribution='gamma',
                                 distribution_args=(10, 2), size=10,
                                 seed=None, start='now', end=None, freq='D'):
    """Generate a random time series

    This function will return a ``pandas.Series`` indexed by a
    ``pandas.DatetimeIndex`` holding random data that is generated by the
    given distribution. The distribution name has to be importable from
    ``numpy.random`` and the `distribution_args` sequence will be passed as
    `*args`. The `seed` parameter will be directed to ``np.random.seed`` in
    order to return reproducible pseudo-random results.

    Parameters
    ----------
    distribution : string, default='gamma'
        Any distribution density function from ``numpy.random`` can be
        chosen. The distribution properties (like location or scale) can be
        passed with the parameter `distribution_args`.
    distribution_args : list, tuple, None, default=(10, 2)
        This sequence will be passed as ``*distribution_args`` into the
        given density function. If no arguments shall be passed,
        distribution_args can be set to `None`.
    size : int, default=10
        Specifies the length of the produced time series.
    seed : int, default=None
        Will be passed to ``numpy.random.seed``.
    start : string, datetime, default='now'
        Starting point for the ``pandas.DatetimeIndex``. Can be either a
        ``datetime`` or string. The string has either to be `'now'` for
        using the current time step, or a Datetime string of format
        `YYYYMMDDHHmmss`, where the time (`HHmmss`) can be omitted. If `end`
        is used, start or `size` should be set to `None`.
    end : string, datetime, default=None
        see start.
    freq : string, default='D'
        Specify the temporal resolution of the time series. This can either
        be used in case `size` is omitted, but `start` and `end` are given,
        or in case either `start` or `end` is omitted but `size` is given.
        Any string accepted by the `freq` attribute of ``pandas.Grouper`` is
        accepted.

    Returns
    -------
    pandas.Series

    Raises
    ------
    ValueError
        If `distribution` is not an attribute of ``numpy.random``.

    See Also
    --------
    pandas.Grouper : further information of `freq` settings

    Notes
    -----
    **Controlling the datetime index**

    The time series index can be controlled by the `size`, `start`, `end`
    and `freq` parameter. The `size` specifies the total length of the time
    series. If `None`, the `size` will be inferred from `start`, `end` and
    `freq`. If `end` is not given, the other three parameter have to be set.

    .. warning::
        You cannot set all 4 time series parameter at the same time. In
        case you do not want to use one that holds a default value (like
        start), you will have to set it to `None`.

    """
    # get the function
    try:
        distribution_function = getattr(np.random, distribution)
    except AttributeError:
        raise ValueError('The distribution %s is not known. It has to be importable from numpy.random' % distribution)

    # the docstring allows None for "no positional arguments"; unpacking
    # None would raise a TypeError, so normalize it to an empty tuple
    if distribution_args is None:
        distribution_args = ()

    # get start and end; datetimes are converted to the string format
    # that is handed over to pandas.date_range below
    if isinstance(start, str) and start.lower() == 'now':
        start = datetime.now()
    if isinstance(start, datetime):
        start = start.strftime('%Y%m%d%H%M%S')
    if isinstance(end, str) and end.lower() == 'now':
        end = datetime.now()
    if isinstance(end, datetime):
        end = end.strftime('%Y%m%d%H%M%S')

    # build the DatetimeIndex and infer size
    index = pd.date_range(start=start, end=end, periods=size, freq=freq)
    inferred_size = len(index)

    # set the seed if necessary
    if seed is not None:
        np.random.seed(seed=seed)

    # generate the distribution
    values = distribution_function(*distribution_args, size=inferred_size)

    # return the time series
    return pd.Series(index=index, data=values)