import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import re
from lib.collectors.stat_collector import DiskResourcesStatCollector, DiskWauForecast022020StatCollector
import logging
import yaml
from lib.analyzers.calc import *
from lib.constants import *
loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict]
for logger in loggers:
    logger.setLevel(logging.WARNING)
from sklearn.linear_model import Ridge
import random
# Data import interval
data_sd = '2019-12-01'
data_ed = '2020-04-07'
# Forecast interval
fc_start = pd.to_datetime(data_ed)
fc_end = pd.to_datetime('2021-01-01')
# Load pre-orders
orders = yaml.safe_load(open('../conf/orders.yaml'))
# Load the hardware Conductor groups for which forecasts are built
disk_cnames = yaml.safe_load(open('../conf/cnames.yaml'))
# Qloud environment filter: per-environment charts are built only for environments
# consuming more than qloud_min_perc percent of the specified resource
qloud_filter_column = 'cpu_total'
qloud_min_perc = 1.5
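# Illustrative only: the Qloud charts themselves are built further down in the report.
# A minimal sketch of how the threshold above could be applied, assuming a per-environment
# DataFrame with an 'environment' column and the resource column named by
# qloud_filter_column (both the DataFrame and the column names are assumptions here).
def _filter_qloud_environments(qloud_df):
    # Share of the chosen resource consumed by each environment, in percent.
    totals = qloud_df.groupby('environment')[qloud_filter_column].sum()
    share = 100.0 * totals / totals.sum()
    big_envs = share[share > qloud_min_perc].index
    return qloud_df[qloud_df['environment'].isin(big_envs)]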
# Download data for the configured interval
disk_data = DiskResourcesStatCollector().download_data((data_sd, data_ed))
df = pd.DataFrame(disk_data)
df[fd] = pd.to_datetime(df[fd])
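# Keep only the MDB and disk.sync-dataapi cgroups, sorted and indexed by date.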
ddb = df[df['cgroup'].isin(['mdb', 'disk.sync-dataapi'])]
ddb = ddb.sort_values(fd).set_index(fd)
ddb
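# Per-host disk totals and usage for the MDB group.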
tddb = ddb[ddb[cg] == 'mdb'][['fqdn', 'disk_total', 'disk_usage']]
tddb
def get_trend_df(_df, sd, ed, key_col, index_name='fielddate'):
    """Fit a regularized trend on key_col and extrapolate it over [sd, ed]."""
    df = _df.copy().fillna(0)
    # Regression features: the timestamp in seconds and its sub-linear powers.
    X = df.index.astype(np.int64) / 10**9
    Y = df[key_col].values
    model = Ridge(alpha=1000)
    model.fit([[x, x**0.5, x**0.7] for x in X], Y)
    trend_range = pd.date_range(sd, ed)
    trend = pd.DataFrame()
    trend[index_name] = trend_range
    trend.set_index(index_name, inplace=True)
    trend[key_col] = model.predict([[x, x**0.5, x**0.7] for x in trend_range.astype(np.int64) / 10**9])
    trend[key_col] = trend[key_col].fillna(0)
    return trend
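# Hypothetical usage (not part of the report): fit a trend on a synthetic month of
# data and extrapolate it two months ahead.
# _demo = pd.DataFrame({'disk_usage': np.linspace(1.0, 2.0, 31)},
#                      index=pd.date_range('2020-01-01', '2020-01-31'))
# _demo_trend = get_trend_df(_demo, '2020-01-01', '2020-03-31', 'disk_usage')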
def build_forecast_in_column(df, key_column, extend_columns, fc_start, fc_end):
    """Append a fitted trend for key_column; carry extend_columns (limits) forward flat."""
    fc_data = df[[key_column] + extend_columns]
    trend = get_trend_df(fc_data, fc_start, fc_end, key_column)
    for e in extend_columns:
        # Limit columns are not forecasted: keep their last known value.
        trend[e] = fc_data[e].iloc[-1]
    return pd.concat([fc_data, trend])[[key_column] + extend_columns]
def plot_forecasted_chart(df, val_col, lim_col, forecast_start_date, ax):
    """Plot actuals as solid lines and the forecast (after forecast_start_date) as dashed lines."""
    forecast_start_date = pd.to_datetime(forecast_start_date)
    # Usage: solid for history, dashed (same color) for the forecasted part.
    plot_res = df[val_col][:forecast_start_date].plot(ax=ax, label=val_col)
    clr = plot_res.lines[-1].get_color()
    df[val_col][forecast_start_date:].plot(ax=ax, linestyle='--', label='_nolegend_', color=clr)
    # Same treatment for the limit column (e.g. disk_total).
    plot_res = df[lim_col][:forecast_start_date].plot(ax=ax, label=lim_col)
    clr = plot_res.lines[-1].get_color()
    df[lim_col][forecast_start_date:].plot(ax=ax, linestyle='--', label='_nolegend_', color=clr)
    ax.tick_params(axis='x', labelsize='small')
    ax.xaxis.label.set_visible(False)
    ax.set_ylim(0, None)
    ax.grid(True, linestyle='--', linewidth=0.5)
usg = 'disk_usage'
tot = 'disk_total'
sum_mdb = tddb.copy()
# Aggregate usage and totals over all MDB hosts by date.
sum_mdb = sum_mdb[['disk_usage', 'disk_total']].groupby(sum_mdb.index).sum()
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
fig.suptitle('Total MDB disk usage', fontsize=20, y=0.96)
fig.autofmt_xdate()
prep_data = build_forecast_in_column(smooth(sum_mdb), usg, [tot], fc_start, fc_end)
plot_forecasted_chart(prep_data, usg, tot, fc_start, ax)
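# Per-host charts: one forecast figure per MDB fqdn.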
mdb_names = list(sorted(tddb['fqdn'].unique()))
for i, name in enumerate(mdb_names):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
    fig.suptitle(name, fontsize=20, y=0.96)
    fig.autofmt_xdate()
    prep_data = build_forecast_in_column(smooth(tddb[tddb['fqdn'] == name]), usg, [tot], fc_start, fc_end)
    plot_forecasted_chart(prep_data, usg, tot, fc_start, ax)
# fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(24, 4))
# fig.suptitle('Stacked MDB Usage', fontsize=20, y=0.96)
# fig.autofmt_xdate()
#
#
# stacked_df = pd.DataFrame(index=tddb.index.unique())
#
# for name in mdb_names:
# prep_data = build_forecast_in_column((tddb[tddb['fqdn'] == name]), usg, [tot], fc_start, fc_end)
# duples = prep_data.duplicated(keep='first')
# duples = ~duples
# prep_data = prep_data[duples]
# prep_data.drop(index=prep_data.index.duplicated(keep='first'))
#