Source code for bpo.visualizers
import matplotlib.pyplot as plt
import pandas
def boxplot(series):
    """
    Draws one boxplot per entry of the data series and shows the figure.

    The data series is a dictionary. Each key becomes the x-axis label of one boxplot and
    each value is the list of numerical data from which that boxplot is constructed.
    Boxplots appear in the iteration order of the dictionary; the labels are rotated
    by 45 degrees.

    :param series: a dictionary that maps a label to a list of numerical data.
    """
    labels = series.keys()
    # Tick positions are numbered 1..len(series), one per boxplot.
    positions = list(range(1, len(series) + 1))

    plt.boxplot(series.values())
    plt.xticks(ticks=positions, labels=labels, rotation=45)
    plt.show()
def line_with_ci(series):
    """
    Draws a line graph with a shaded band around it and shows the figure.

    The data series is a dictionary. Each key is a numerical value that represents an
    x-coordinate. Each value is a pair of numerical values: the first element is the
    y-coordinate and the second element is an interval ci around the y-coordinate.
    A line is drawn through the (x, y) points and the area between y-ci and y+ci is
    filled in translucent blue. Typically, the ci value represents the confidence interval.
    Points are plotted in the iteration order of the dictionary.

    :param series: a dictionary that maps numerical x values to (y, ci) numerical pairs.
    """
    xs = series.keys()

    # Collect the line and both edges of the band in a single pass over the pairs.
    ys, lower, upper = [], [], []
    for y_value, half_width in series.values():
        ys.append(y_value)
        lower.append(y_value - half_width)
        upper.append(y_value + half_width)

    plt.plot(xs, ys)
    plt.fill_between(xs, lower, upper, color='blue', alpha=0.1)
    plt.show()
def statistics(log, datetime_format="%Y/%m/%d %H:%M:%S"):
    """
    Creates statistics for the interarrival times and the processing times of the given log.

    Returns the statistics as a dictionary with the labels of the statistics as keys and as
    values lists with all the observed times, expressed in hours:

    - the first key holds the interarrival times, i.e. the differences between the start times
      of consecutive cases, where the start time of a case is its earliest start_time and the
      cases are ordered by that start time;
    - every further key is a task label (in order of first appearance in the log) and maps to
      the durations (completion_time - start_time) of all log rows of that task.

    The log must contain the columns case_id, task, start_time, completion_time.
    The log itself is not modified.

    :param log: a pandas dataframe containing the log.
    :param datetime_format: optional parameter with the datetime formatting rule that will be
        used to interpret the start and completion timestamps
    :return: a dictionary that maps each statistic label to a list of times in hours.
    """
    start = pandas.to_datetime(log['start_time'], format=datetime_format)
    completion = pandas.to_datetime(log['completion_time'], format=datetime_format)

    # Vectorised subtraction instead of a row-wise apply with positional row[0]/row[1]
    # lookups, which pandas deprecates for label-indexed Series.
    duration = (completion - start).dt.total_seconds() / 3600

    # A case arrives when its first task starts; order the cases by arrival.
    case_start = start.groupby(log['case_id']).min().sort_values()
    # diff() has no predecessor for the first case, so skip that leading entry.
    interarrival_times = (case_start.diff().iloc[1:].dt.total_seconds() / 3600).tolist()

    processing_times = {
        task: duration[log['task'] == task].tolist()
        for task in log['task'].unique()
    }

    # NOTE: the spelling of this key is kept as-is because callers may look it up by name.
    return {'Interarrrival times': interarrival_times, **processing_times}