# Source code for bpo.visualizers

import matplotlib.pyplot as plt
import pandas


def boxplot(series):
    """
    Creates a boxplot for each element in the data series and shows the figure.
    The data series is a dictionary. Each key is used as the label of the boxplot.
    Each value is a list of numerical data that is used to construct the boxplot.
    The boxplots appear in the iteration order of the dictionary.

    :param series: a dictionary that maps a label to a list of numerical data.
    """
    # Materialise the dict views once so that labels and data are plain lists
    # that are guaranteed to be in the same order.
    labels = list(series.keys())
    samples = list(series.values())

    plt.boxplot(samples)
    # By default matplotlib draws the boxes at x positions 1..n, so the labels
    # are attached to exactly those tick positions.
    plt.xticks(ticks=list(range(1, len(labels) + 1)), labels=labels, rotation=45)
    plt.show()


def line_with_ci(series):
    """
    Creates a line graph for the data series and shows the figure.
    The data series is a dictionary. Each key is a numerical value that represents an x-coordinate.
    Each value is a pair of numerical values, where the first element is the y-coordinate and the
    second element is an interval ci around the y-coordinate. A line graph is created based on the
    (x, y) values with a shaded band between the y-ci and y+ci values. Typically, the ci value
    represents the confidence interval. The points are drawn in the iteration order of the dictionary.

    :param series: a dictionary that maps numerical x values to (y, ci) numerical pairs.
    """
    # A dict view is not an indexable sequence, so the keys are materialised as
    # a list before they are handed to the plotting functions.
    x = list(series.keys())

    # Single pass over the (y, ci) pairs to build the line and both band edges.
    y, ci_bottom, ci_top = [], [], []
    for mean, h in series.values():
        y.append(mean)
        ci_bottom.append(mean - h)
        ci_top.append(mean + h)

    plt.plot(x, y)
    plt.fill_between(x, ci_bottom, ci_top, color='blue', alpha=0.1)
    plt.show()


def statistics(log, datetime_format="%Y/%m/%d %H:%M:%S"):
    """
    Creates statistics for the interarrival times and the processing times of the given log.
    Returns the statistics as a dictionary with the labels of the statistics as keys and as values lists
    with all the observed times, expressed in hours.

    The key 'Interarrrival times' holds the time between the starts of consecutive cases, where the
    start of a case is the earliest start_time among its rows and cases are ordered by that start.
    In addition there is one key per task, which holds the duration (completion_time - start_time)
    of each row of that task, in the order in which the rows occur in the log.

    The columns case_id, task, start_time and completion_time of the log are read.
    The log that is passed in is not modified; the function works on a copy.

    :param log: a pandas dataframe containing the log.
    :param datetime_format: optional parameter with the datetime formatting rule that will be used to interpret the start and completion timestamps
    :return: a dictionary that maps 'Interarrrival times' and each task to a list of times in hours.
    """
    df = log.copy()
    df['start_time'] = pandas.to_datetime(df['start_time'], format=datetime_format)
    df['completion_time'] = pandas.to_datetime(df['completion_time'], format=datetime_format)
    # Column-wise subtraction instead of a row-wise apply: this avoids positional
    # integer lookups on a label-indexed row.
    df['duration'] = (df['completion_time'] - df['start_time']).dt.total_seconds() / 3600

    # One list of durations per task; unique() keeps the tasks in order of first appearance.
    processing_times = {
        task: df.loc[df['task'] == task, 'duration'].tolist()
        for task in df['task'].unique()
    }

    # A case arrives at its earliest start time. Sorting the arrivals makes
    # diff() yield the gap between each case and the case before it.
    case_starts = df.groupby('case_id')['start_time'].min().sort_values()
    # The first case has no predecessor, so its (undefined) gap is skipped.
    gaps = case_starts.diff().iloc[1:]
    interarrival_times = (gaps.dt.total_seconds() / 3600).tolist()

    # NOTE: the spelling of the 'Interarrrival times' key is kept as is, because
    # callers may look the result up by this exact label.
    return {'Interarrrival times': interarrival_times, **processing_times}