In [233]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import re

from lib.collectors.stat_collector import DiskResourcesStatCollector, DiskWauForecast022020StatCollector
import logging
import yaml
from lib.analyzers.calc import *
from lib.constants import *

loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.WARNING)

from sklearn.linear_model import Ridge
import random
In [234]:
# Data import interval
data_sd = '2019-12-01'
data_ed = '2020-04-07'

# Forecast interval
fc_start = pd.to_datetime(data_ed)
fc_end = pd.to_datetime('2021-01-01')

# Load pre-orders
orders = yaml.safe_load(open('../conf/orders.yaml'))
# Load the hardware Conductor groups that forecasts are built for
disk_cnames = yaml.safe_load(open('../conf/cnames.yaml'))

# Qloud environment filter: per-environment charts are built only for environments
# consuming more than X% of the given resource
qloud_filter_column = 'cpu_total'
qloud_min_perc = 1.5
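
The Qloud filter itself is applied elsewhere; purely as a sketch of the rule the comment above describes (qloud_df is hypothetical and not part of this notebook):

# keep only environments consuming more than qloud_min_perc percent of the column total
threshold = qloud_df[qloud_filter_column].sum() * qloud_min_perc / 100
big_envs = qloud_df[qloud_df[qloud_filter_column] > threshold]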
In [235]:
# Download data for the given interval
disk_data = DiskResourcesStatCollector().download_data((data_sd, data_ed))
df = pd.DataFrame(disk_data)


df[fd] = pd.to_datetime(df[fd])
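
Note: `fd` (and later `cg`) are not defined in this notebook; they presumably come from the star import of lib.constants as column-name constants, along these lines (an assumption, not the actual module):

# assumed definitions in lib.constants (not shown here)
fd = 'fielddate'  # date column; matches the index name in the output below
cg = 'cgroup'     # conductor-group column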
In [236]:
ddb = df[df[cg].isin(['mdb', 'disk.sync-dataapi'])]
ddb = ddb.sort_values(fd).set_index(fd)

tddb = ddb[ddb[cg] == 'mdb'][['fqdn', 'disk_total', 'disk_usage']]
tddb
Out[236]:
fqdn disk_total disk_usage
fielddate
2019-12-01 datasync-24 4608.0 2666.646763
2019-12-01 datasync-23 4608.0 3555.442837
2019-12-01 datasync-22 4608.0 1318.597824
2019-12-01 datasync-21 4608.0 1323.534496
2019-12-01 datasync-20 4608.0 3795.855495
2019-12-01 datasync-19 4608.0 3834.531830
2019-12-01 datasync-18 4608.0 2787.743950
2019-12-01 datasync-17 4608.0 2501.596691
2019-12-01 datasync-16 6144.0 4422.913067
2019-12-01 datasync-15 6144.0 4378.074787
2019-12-01 datasync-14 4608.0 3665.841309
2019-12-01 datasync-01 4500.0 3008.401226
2019-12-01 datasync-12 4608.0 3353.422848
2019-12-01 datasync-02 4500.0 3008.551342
2019-12-01 datasync-03 4500.0 3005.652744
2019-12-01 datasync-04 4500.0 3022.706390
2019-12-01 datasync-05 4500.0 3023.040867
2019-12-01 datasync-06 4500.0 3030.359833
2019-12-01 datasync-13 4608.0 3441.666321
2019-12-01 datasync-08 4500.0 3010.543781
2019-12-01 datasync-09 4500.0 3008.680134
2019-12-01 datasync-10 4608.0 3192.021175
2019-12-01 datasync-11 4608.0 3518.312611
2019-12-01 datasync-07 4500.0 2812.152790
2019-12-02 datasync-24 4608.0 2678.881992
2019-12-02 datasync-23 4608.0 3684.713066
2019-12-02 datasync-22 4608.0 1327.747826
2019-12-02 datasync-21 4608.0 1333.825588
2019-12-02 datasync-20 4608.0 3801.223000
2019-12-02 datasync-19 4608.0 3837.668983
... ... ... ...
2020-04-06 datasync-06 4500.0 3049.129940
2020-04-06 datasync-07 4500.0 2864.837891
2020-04-06 datasync-08 4500.0 3039.880322
2020-04-06 datasync-09 4500.0 3033.091362
2020-04-06 datasync-10 4608.0 3267.018078
2020-04-06 datasync-05 4500.0 3046.298019
2020-04-07 datasync-15 6144.0 3966.138821
2020-04-07 datasync-16 6144.0 4116.509815
2020-04-07 datasync-17 5100.0 4080.818462
2020-04-07 datasync-18 4950.0 4219.900486
2020-04-07 datasync-20 6159.0 3851.852573
2020-04-07 datasync-21 4608.0 3225.034775
2020-04-07 datasync-22 4608.0 3216.137394
2020-04-07 datasync-23 6144.0 3957.064236
2020-04-07 datasync-14 4608.0 3875.710625
2020-04-07 datasync-19 5208.0 3903.598831
2020-04-07 datasync-13 4608.0 3906.490398
2020-04-07 datasync-06 4500.0 3050.805912
2020-04-07 datasync-11 5040.0 4129.942902
2020-04-07 datasync-10 4608.0 3265.998802
2020-04-07 datasync-09 4500.0 3029.668537
2020-04-07 datasync-08 4500.0 3037.783234
2020-04-07 datasync-07 4500.0 2867.751377
2020-04-07 datasync-05 4500.0 3048.643944
2020-04-07 datasync-04 4500.0 3032.057407
2020-04-07 datasync-03 4500.0 3022.421459
2020-04-07 datasync-02 4500.0 3032.681702
2020-04-07 datasync-01 4500.0 3035.584023
2020-04-07 datasync-12 4608.0 3918.690853
2020-04-07 datasync-24 3500.0 2796.933514

3095 rows × 3 columns

In [237]:
def get_trend_df(_df, sd, ed, key_col, index_name='fielddate'):
    df = _df.copy().fillna(0)
    # convert the DatetimeIndex to Unix seconds for use as regression features
    X = df.index.astype(np.int64) / 10**9
    Y = df[key_col].values
    # heavily regularized ridge on [t, sqrt(t), t^0.7] keeps the extrapolation smooth
    model = Ridge(alpha=1000)
    model.fit([[x, x**0.5, x**0.7] for x in X], Y)

    trend_range = pd.date_range(sd, ed)
    trend = pd.DataFrame()
    trend[index_name] = trend_range
    trend.set_index(index_name, inplace=True)
    trend[key_col] = model.predict([[x, x**0.5, x**0.7] for x in trend_range.astype(np.int64) / 10**9])
    trend[key_col] = trend[key_col].fillna(0)
    return trend
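
get_trend_df fits a heavily regularized ridge regression on three monotone transforms of the Unix timestamp, [t, sqrt(t), t**0.7], so the extrapolated trend stays smooth and close to linear. A minimal self-contained sanity check on synthetic data (all names here are illustrative only):

# synthetic daily series with sqrt-like growth plus noise
idx = pd.date_range('2019-12-01', '2020-04-07')
t = idx.astype(np.int64) / 10**9
demo = pd.DataFrame({'disk_usage': 0.05 * t**0.5 + np.random.normal(0, 1, len(idx))}, index=idx)

trend = get_trend_df(demo, '2020-04-07', '2021-01-01', 'disk_usage')
trend.tail()  # extrapolated daily values up to 2021-01-01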
In [238]:
def build_forecast_in_column(df, key_column, extend_columns, fc_start, fc_end):

    fc_data = df[[key_column] + extend_columns]
    trend = get_trend_df(fc_data, fc_start, fc_end, key_column)
    # carry the extra columns forward flat at their last observed value
    for e in extend_columns:
        trend[e] = fc_data[e].iloc[-1]

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    return pd.concat([fc_data, trend])[[key_column] + extend_columns]

def plot_forecasted_chart(df, val_col, lim_col, forecast_start_date, ax):
    forecast_start_date = pd.to_datetime(forecast_start_date)

    # solid line up to the forecast start, dashed line (same color) afterwards
    plot_res = df[val_col][:forecast_start_date].plot(ax=ax, label=val_col)
    clr = plot_res.lines[-1].get_color()
    df[val_col][forecast_start_date:].plot(ax=ax, linestyle='--', label='_nolegend_', color=clr)

    plot_res = df[lim_col][:forecast_start_date].plot(ax=ax, label=lim_col)
    clr = plot_res.lines[-1].get_color()
    df[lim_col][forecast_start_date:].plot(ax=ax, linestyle='--', label='_nolegend_', color=clr)

    ax.tick_params(axis='x', labelsize='small')
    ax.xaxis.label.set_visible(False)
    ax.set_ylim(0, None)
    ax.grid(True, linestyle='--', linewidth=0.5)
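
smooth(), used in the next cells, comes from the star import of lib.analyzers.calc and is not shown in this notebook. For a self-contained run, a plausible stand-in is a rolling mean over the numeric columns (an assumption, not the library's actual implementation):

def smooth(df, window=7):
    # hypothetical stand-in for lib.analyzers.calc.smooth:
    # rolling mean over numeric columns, datetime index preserved
    return df.select_dtypes(include='number').rolling(window, min_periods=1).mean()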
    
In [239]:
usg = 'disk_usage'
tot = 'disk_total'

# aggregate usage and capacity across all hosts per date
# (select numeric columns only: summing the string 'fqdn' column errors in recent pandas)
sum_mdb = tddb[[usg, tot]].groupby(tddb.index).sum()


fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
fig.suptitle('Total MDB disk usage', fontsize=20, y=0.96)
fig.autofmt_xdate()

prep_data = build_forecast_in_column(smooth(sum_mdb), usg, [tot], fc_start, fc_end)
plot_forecasted_chart(prep_data, usg, tot, fc_start, ax)
In [240]:
mdb_names = list(sorted(tddb['fqdn'].unique()))

for i, name in enumerate(mdb_names):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
    fig.suptitle(name, fontsize=20, y=0.96)
    fig.autofmt_xdate()

    prep_data = build_forecast_in_column(smooth(tddb[tddb['fqdn'] == name]), usg, [tot], fc_start, fc_end)
    plot_forecasted_chart(prep_data, usg, tot, fc_start, ax)
In [241]:
# fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
# fig.suptitle('Stacked MDB Usage', fontsize=20, y=0.96)
# fig.autofmt_xdate()
# 
# 
# stacked_df = pd.DataFrame(index=tddb.index.unique())
# 
# for name in mdb_names:
#     prep_data = build_forecast_in_column((tddb[tddb['fqdn'] == name]), usg, [tot], fc_start, fc_end)
#     duples = prep_data.duplicated(keep='first')
#     duples = ~duples
#     prep_data = prep_data[duples]
#     prep_data.drop(index=prep_data.index.duplicated(keep='first'))
#     
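
The commented-out cell above is an unfinished sketch of a stacked per-host usage chart; as written, the final drop() would also be a no-op, since DataFrame.drop expects index labels rather than the boolean mask returned by index.duplicated(). A working version under the same assumptions might look like:

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
fig.suptitle('Stacked MDB Usage', fontsize=20, y=0.96)
fig.autofmt_xdate()

stacked = {}
for name in mdb_names:
    prep_data = build_forecast_in_column(tddb[tddb['fqdn'] == name], usg, [tot], fc_start, fc_end)
    # keep the first observation per date; duplicates appear where data and trend overlap
    prep_data = prep_data[~prep_data.index.duplicated(keep='first')]
    stacked[name] = prep_data[usg]

stacked_df = pd.DataFrame(stacked).fillna(0)
ax.stackplot(stacked_df.index, stacked_df.T.values, labels=mdb_names)
ax.legend(fontsize='x-small', ncol=4)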