The data comes from data.gv.at, Austria's official open data portal.
More about it here.
The datasets used are:
This project has multiple goals.
import datetime
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import plotly.io as pio
# for rendering plot static
# pio.renderers.default = "svg"
data_file_name = 'data/OGD_gest_kalwo_GEST_KALWOCHE_100.csv'
covid_deaths_data_file_name = 'data/timeline-faelle-bundeslaender.csv'
df = pd.read_csv(data_file_name, sep=';')
df_deaths = pd.read_csv(covid_deaths_data_file_name, sep=';')
df
 | C-KALWOCHE-0 | C-B00-0 | C-ALTERGR65-0 | C-C11-0 | F-ANZ-1 |
---|---|---|---|---|---|
0 | KALW-200001 | B00-1 | ALTERSGR65-1 | C11-1 | 8 |
1 | KALW-200001 | B00-1 | ALTERSGR65-1 | C11-2 | 2 |
2 | KALW-200001 | B00-1 | ALTERSGR65-2 | C11-1 | 25 |
3 | KALW-200001 | B00-1 | ALTERSGR65-2 | C11-2 | 33 |
4 | KALW-200001 | B00-2 | ALTERSGR65-1 | C11-1 | 7 |
... | ... | ... | ... | ... | ... |
41998 | KALW-202223 | B00-8 | ALTERSGR65-2 | C11-2 | 28 |
41999 | KALW-202223 | B00-9 | ALTERSGR65-1 | C11-1 | 24 |
42000 | KALW-202223 | B00-9 | ALTERSGR65-1 | C11-2 | 13 |
42001 | KALW-202223 | B00-9 | ALTERSGR65-2 | C11-1 | 125 |
42002 | KALW-202223 | B00-9 | ALTERSGR65-2 | C11-2 | 159 |
42003 rows × 5 columns
df_deaths
 | Datum | BundeslandID | Name | BestaetigteFaelleBundeslaender | Todesfaelle | Genesen | Hospitalisierung | Intensivstation | Testungen | TestungenPCR | TestungenAntigen |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-03-01T09:30:00+01:00 | 1 | Burgenland | 12657 | 239 | 11701 | 43 | 10 | 638575 | 155435 | 483140 |
1 | 2021-03-01T09:30:00+01:00 | 2 | Kärnten | 29225 | 683 | 27316 | 96 | 15 | 675557 | 217933 | 457624 |
2 | 2021-03-01T09:30:00+01:00 | 3 | Niederösterreich | 74538 | 1350 | 67900 | 349 | 82 | 3400756 | 1141984 | 2258772 |
3 | 2021-03-01T09:30:00+01:00 | 4 | Oberösterreich | 86409 | 1515 | 82567 | 120 | 20 | 2162517 | 546777 | 1615740 |
4 | 2021-03-01T09:30:00+01:00 | 5 | Salzburg | 37327 | 493 | 35318 | 71 | 15 | 823353 | 274598 | 548755 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4845 | 2022-06-28T09:30:00+02:00 | 6 | Steiermark | 561729 | 3399 | 551626 | 102 | 8 | 21098196 | 5816694 | 15281502 |
4846 | 2022-06-28T09:30:00+02:00 | 7 | Tirol | 367338 | 939 | 361988 | 77 | 2 | 9590745 | 3845911 | 5744834 |
4847 | 2022-06-28T09:30:00+02:00 | 8 | Vorarlberg | 211284 | 549 | 208983 | 20 | 2 | 7088523 | 1543803 | 5544720 |
4848 | 2022-06-28T09:30:00+02:00 | 9 | Wien | 955399 | 4028 | 914608 | 199 | 23 | 65808360 | 58193874 | 7614486 |
4849 | 2022-06-28T09:30:00+02:00 | 10 | Österreich | 4403444 | 18768 | 4293115 | 842 | 47 | 190225970 | 96219579 | 94006391 |
4850 rows × 11 columns
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile2 = ProfileReport(df_deaths, title="Pandas Profiling Report")
# profile2
df = df.rename(columns={"C-KALWOCHE-0": "cal_week", "C-B00-0": "state", "C-C11-0": "gender", "F-ANZ-1": "counts"})
# split e.g. "KALW-200001" into the year ("2000") and the calendar week ("01")
df['year'] = [d.split("-")[1][:4] for d in df['cal_week']]
df['cal_week'] = [d.split("-")[1][4:] for d in df['cal_week']]
# build a parseable string: year + week + weekday ('0' = Sunday of that week)
df['formatted_date'] = df.year.astype(str) + df.cal_week.astype(str) + '0'
df['date'] = pd.to_datetime(df['formatted_date'], format='%Y%W%w')
# df['datetime'] = pd.to_datetime(df.year.astype(str) + '-' +
#                                 df.cal_week.astype(str) + '-1', format="%Y-%W-%w").dt.strftime('%Y-%W')
# ordinal day number, useful for numeric/regression work
df['conv_date'] = df.date.map(datetime.datetime.toordinal)
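The format '%Y%W%w' parses the concatenated year, week number (%W counts weeks starting on Monday) and weekday digit, where the appended '0' selects the Sunday of that week. A quick sanity check on the first week in the data, whose result matches the date column shown below:
# week 01 of 2000, weekday 0 (Sunday) -> 2000-01-09
print(pd.to_datetime('2000' + '01' + '0', format='%Y%W%w'))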
df
 | cal_week | state | C-ALTERGR65-0 | gender | counts | year | formatted_date | date | conv_date |
---|---|---|---|---|---|---|---|---|---|
0 | 01 | B00-1 | ALTERSGR65-1 | C11-1 | 8 | 2000 | 2000010 | 2000-01-09 | 730128 |
1 | 01 | B00-1 | ALTERSGR65-1 | C11-2 | 2 | 2000 | 2000010 | 2000-01-09 | 730128 |
2 | 01 | B00-1 | ALTERSGR65-2 | C11-1 | 25 | 2000 | 2000010 | 2000-01-09 | 730128 |
3 | 01 | B00-1 | ALTERSGR65-2 | C11-2 | 33 | 2000 | 2000010 | 2000-01-09 | 730128 |
4 | 01 | B00-2 | ALTERSGR65-1 | C11-1 | 7 | 2000 | 2000010 | 2000-01-09 | 730128 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41998 | 23 | B00-8 | ALTERSGR65-2 | C11-2 | 28 | 2022 | 2022230 | 2022-06-12 | 738318 |
41999 | 23 | B00-9 | ALTERSGR65-1 | C11-1 | 24 | 2022 | 2022230 | 2022-06-12 | 738318 |
42000 | 23 | B00-9 | ALTERSGR65-1 | C11-2 | 13 | 2022 | 2022230 | 2022-06-12 | 738318 |
42001 | 23 | B00-9 | ALTERSGR65-2 | C11-1 | 125 | 2022 | 2022230 | 2022-06-12 | 738318 |
42002 | 23 | B00-9 | ALTERSGR65-2 | C11-2 | 159 | 2022 | 2022230 | 2022-06-12 | 738318 |
42003 rows × 9 columns
df.groupby('date').agg('sum')
date | counts | conv_date |
---|---|---|
2000-01-09 | 1867 | 26284608 |
2000-01-16 | 1902 | 26284860 |
2000-01-23 | 2027 | 26285112 |
2000-01-30 | 1940 | 26285364 |
2000-02-06 | 1928 | 26285616 |
... | ... | ... |
2022-05-15 | 1568 | 25840150 |
2022-05-22 | 1592 | 25840395 |
2022-05-29 | 1508 | 26578944 |
2022-06-05 | 1554 | 26579196 |
2022-06-12 | 1560 | 26579448 |
1167 rows × 2 columns
# total deaths per calendar week (summed over state, age group and gender)
grpd_date = df.groupby('date').agg('sum')
grpd_date = grpd_date.rename(columns={'counts':'nr_deaths'})
# grpd_date['date'] = grpd_date.index
# grpd_date
# work on a copy of the raw data and restrict it to the Austria-wide totals ('Österreich')
temp = df_deaths.copy(deep=True)
df_deaths = temp.copy(deep=True)
df_deaths = df_deaths[df_deaths.Name == 'Österreich']
df_deaths
 | Datum | BundeslandID | Name | BestaetigteFaelleBundeslaender | Todesfaelle | Genesen | Hospitalisierung | Intensivstation | Testungen | TestungenPCR | TestungenAntigen |
---|---|---|---|---|---|---|---|---|---|---|---|
9 | 2021-03-01T09:30:00+01:00 | 10 | Österreich | 460849 | 8574 | 432016 | 1353 | 290 | 15003345 | 5395666 | 9607679 |
19 | 2021-03-02T09:30:00+01:00 | 10 | Österreich | 462769 | 8605 | 433873 | 1427 | 296 | 15358139 | 5456284 | 9901855 |
29 | 2021-03-03T09:30:00+01:00 | 10 | Österreich | 465322 | 8625 | 435669 | 1415 | 313 | 15602870 | 5507472 | 10095398 |
39 | 2021-03-04T09:30:00+01:00 | 10 | Österreich | 467646 | 8652 | 437202 | 1425 | 302 | 15864120 | 5557591 | 10306529 |
49 | 2021-03-05T09:30:00+01:00 | 10 | Österreich | 470314 | 8669 | 439101 | 1421 | 326 | 16123615 | 5610644 | 10512971 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4809 | 2022-06-24T09:30:00+02:00 | 10 | Österreich | 4370852 | 18757 | 4271277 | 662 | 42 | 189894351 | 95918261 | 93976090 |
4819 | 2022-06-25T09:30:00+02:00 | 10 | Österreich | 4379764 | 18758 | 4276737 | 677 | 44 | 189984714 | 96003641 | 93981073 |
4829 | 2022-06-26T09:30:00+02:00 | 10 | Österreich | 4386857 | 18760 | 4281819 | 686 | 46 | 190048498 | 96065222 | 93983276 |
4839 | 2022-06-27T09:30:00+02:00 | 10 | Österreich | 4393255 | 18764 | 4286495 | 777 | 48 | 190108723 | 96122902 | 93985821 |
4849 | 2022-06-28T09:30:00+02:00 | 10 | Österreich | 4403444 | 18768 | 4293115 | 842 | 47 | 190225970 | 96219579 | 94006391 |
485 rows × 11 columns
df_deaths = df_deaths.rename(columns={'Datum':'formatted_date', 'Todesfaelle':'deaths'})
df_deaths = df_deaths[['formatted_date', 'deaths']]
df_deaths
 | formatted_date | deaths |
---|---|---|
9 | 2021-03-01T09:30:00+01:00 | 8574 |
19 | 2021-03-02T09:30:00+01:00 | 8605 |
29 | 2021-03-03T09:30:00+01:00 | 8625 |
39 | 2021-03-04T09:30:00+01:00 | 8652 |
49 | 2021-03-05T09:30:00+01:00 | 8669 |
... | ... | ... |
4809 | 2022-06-24T09:30:00+02:00 | 18757 |
4819 | 2022-06-25T09:30:00+02:00 | 18758 |
4829 | 2022-06-26T09:30:00+02:00 | 18760 |
4839 | 2022-06-27T09:30:00+02:00 | 18764 |
4849 | 2022-06-28T09:30:00+02:00 | 18768 |
485 rows × 2 columns
# keep only the date part of the ISO timestamp, then parse it
df_deaths['formatted_date'] = [d.split("T")[0] for d in df_deaths['formatted_date']]
df_deaths['date'] = pd.to_datetime(df_deaths['formatted_date'])
# df_deaths['formatted_date'] = pd.to_datetime(df_deaths['formatted_date'])
# df_deaths['formatted_date'] = df_deaths['formatted_date'].dt.tz_localize('CET', utc=True)
# df_deaths['date'] = df_deaths['formatted_date'].dt.strftime('%Y-%m-%d')
df_deaths
 | formatted_date | deaths | date |
---|---|---|---|
9 | 2021-03-01 | 8574 | 2021-03-01 |
19 | 2021-03-02 | 8605 | 2021-03-02 |
29 | 2021-03-03 | 8625 | 2021-03-03 |
39 | 2021-03-04 | 8652 | 2021-03-04 |
49 | 2021-03-05 | 8669 | 2021-03-05 |
... | ... | ... | ... |
4809 | 2022-06-24 | 18757 | 2022-06-24 |
4819 | 2022-06-25 | 18758 | 2022-06-25 |
4829 | 2022-06-26 | 18760 | 2022-06-26 |
4839 | 2022-06-27 | 18764 | 2022-06-27 |
4849 | 2022-06-28 | 18768 | 2022-06-28 |
485 rows × 3 columns
grpd_date_df_deaths = df_deaths.groupby('date').agg('sum')
# Todesfaelle is a cumulative total; diff() turns it into deaths per day
grpd_date_df_deaths['deaths_per_day'] = grpd_date_df_deaths['deaths'].diff()
# https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(21)02796-3/fulltext#seccestitle150
# the Lancet excess-mortality study estimates a ratio of excess to reported COVID-19 deaths of about 1.33 for Austria
# grpd_date_df_deaths['deaths_per_day_corrected'] = grpd_date_df_deaths.deaths_per_day * 1.33
grpd_date_df_deaths
date | deaths | deaths_per_day |
---|---|---|
2021-03-01 | 8574 | NaN |
2021-03-02 | 8605 | 31.0 |
2021-03-03 | 8625 | 20.0 |
2021-03-04 | 8652 | 27.0 |
2021-03-05 | 8669 | 17.0 |
... | ... | ... |
2022-06-24 | 18757 | 8.0 |
2022-06-25 | 18758 | 1.0 |
2022-06-26 | 18760 | 2.0 |
2022-06-27 | 18764 | 4.0 |
2022-06-28 | 18768 | 4.0 |
485 rows × 2 columns
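Since the raw Todesfaelle column is a cumulative total, diff() converts it into daily increments, which is what the deaths_per_day column above shows. A tiny illustrative check using the first cumulative values from the table above:
# cumulative totals -> daily increments
cum = pd.Series([8574, 8605, 8625, 8652, 8669])
print(cum.diff().tolist())  # [nan, 31.0, 20.0, 27.0, 17.0]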
fig = px.line(
x=grpd_date_df_deaths.index,
y=grpd_date_df_deaths.deaths_per_day,
title='COVID-19 deaths per day'
)
fig.show()
total_df = pd.merge(grpd_date, grpd_date_df_deaths, how='left', on='date')
total_df
date | nr_deaths | conv_date | deaths | deaths_per_day |
---|---|---|---|---|
2000-01-09 | 1867 | 26284608 | NaN | NaN |
2000-01-16 | 1902 | 26284860 | NaN | NaN |
2000-01-23 | 2027 | 26285112 | NaN | NaN |
2000-01-30 | 1940 | 26285364 | NaN | NaN |
2000-02-06 | 1928 | 26285616 | NaN | NaN |
... | ... | ... | ... | ... |
2022-05-15 | 1568 | 25840150 | 18303.0 | 0.0 |
2022-05-22 | 1592 | 25840395 | 18347.0 | 4.0 |
2022-05-29 | 1508 | 26578944 | 18651.0 | 0.0 |
2022-06-05 | 1554 | 26579196 | 18670.0 | 1.0 |
2022-06-12 | 1560 | 26579448 | 18697.0 | 3.0 |
1167 rows × 4 columns
total_df = total_df.rename(
columns={'deaths_per_day':'covid_deaths'})
total_df = total_df[['nr_deaths', 'covid_deaths']]
total_df
date | nr_deaths | covid_deaths |
---|---|---|
2000-01-09 | 1867 | NaN |
2000-01-16 | 1902 | NaN |
2000-01-23 | 2027 | NaN |
2000-01-30 | 1940 | NaN |
2000-02-06 | 1928 | NaN |
... | ... | ... |
2022-05-15 | 1568 | 0.0 |
2022-05-22 | 1592 | 4.0 |
2022-05-29 | 1508 | 0.0 |
2022-06-05 | 1554 | 1.0 |
2022-06-12 | 1560 | 3.0 |
1167 rows × 2 columns
total_df.describe()
 | nr_deaths | covid_deaths |
---|---|---|
count | 1167.000000 | 67.000000 |
mean | 1513.545844 | 11.268657 |
std | 207.013936 | 10.300900 |
min | 1185.000000 | 0.000000 |
25% | 1392.000000 | 3.000000 |
50% | 1473.000000 | 6.000000 |
75% | 1578.000000 | 18.500000 |
max | 3953.000000 | 43.000000 |
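The covid_deaths column has only 67 non-missing values because the left merge on date keeps just the daily increment that was reported on each week-ending Sunday, not a weekly total. If weekly COVID-19 deaths were wanted instead, one option (a sketch only, not used for the figures below; weekly_covid and total_df_weekly are illustrative names) would be to resample the daily increments to weekly sums before joining:
# sum the daily COVID-19 increments into weeks ending on Sunday, matching the weekly index
weekly_covid = grpd_date_df_deaths['deaths_per_day'].resample('W-SUN').sum()
total_df_weekly = grpd_date.join(weekly_covid.rename('covid_deaths_weekly'), how='left')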
# very loose quantile limits: this only trims the most extreme weekly values
upper_limit = total_df['nr_deaths'].quantile(0.99999)
lower_limit = total_df['nr_deaths'].quantile(0.00001)
new_df = total_df[(
total_df['nr_deaths'] <= upper_limit) & (
total_df['nr_deaths'] >= lower_limit)]
new_df
date | nr_deaths | covid_deaths |
---|---|---|
2000-01-09 | 1867 | NaN |
2000-01-16 | 1902 | NaN |
2000-01-23 | 2027 | NaN |
2000-01-30 | 1940 | NaN |
2000-02-06 | 1928 | NaN |
... | ... | ... |
2022-05-15 | 1568 | 0.0 |
2022-05-22 | 1592 | 4.0 |
2022-05-29 | 1508 | 0.0 |
2022-06-05 | 1554 | 1.0 |
2022-06-12 | 1560 | 3.0 |
1165 rows × 2 columns
final_df = new_df
only_covid = final_df[final_df.covid_deaths.notna()]
only_covid.shape
(67, 2)
# only if we want to limit all deaths to covid time period
# final_df = only_covid
fig = px.line(
x=final_df.index,
y=final_df.nr_deaths,
title='Deaths per week'
)
fig.add_traces(
go.Scatter(
x=final_df.index, y=final_df.covid_deaths,
name='Covid deaths'))
fig.show()
trace1 = go.Scatter(
x=final_df.index,
y=final_df.nr_deaths,
mode='lines',
line=dict(width=1.5))
trace2 = go.Scatter(
x=final_df.index, y=final_df.covid_deaths,
name='Covid deaths')
# one animation frame per time step; frame k reveals both series up to index k
frames=[
dict(
data=[
dict(
type = 'scatter',
x=final_df.index[:k],
y=final_df.nr_deaths[:k]),
dict(
type = 'scatter',
x=final_df.index[:k],
y=final_df.covid_deaths[:k])]
)
for k in range(0, len(final_df))]
layout = go.Layout(width=1000,
height=600,
showlegend=False,
hovermode='x unified',
updatemenus=[
dict(
type='buttons', showactive=False,
y=1.05,
x=1.15,
xanchor='right',
yanchor='top',
pad=dict(t=0, r=10),
buttons=[dict(label='Build line',
method='animate',
args=[None,
dict(frame=dict(duration=2,
redraw=False),
transition=dict(duration=0),
fromcurrent=True,
mode='immediate')]
)]
),
dict(
type = "buttons",
direction = "left",
buttons=list([
dict(
args=[{"yaxis.type": "linear"}],
label="LINEAR",
method="relayout"
),
dict(
args=[{"yaxis.type": "log"}],
label="LOG",
method="relayout"
)
]),
),
]
)
# layout.update(xaxis =dict(range=['2020-03-16', '2020-06-13'], autorange=False),
# yaxis =dict(range=[0, 35000], autorange=False));
fig = go.Figure(data=[trace1, trace2], frames=frames, layout=layout)
# fig.show()
Ordinary Least Squares
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
trendline="ols",
trendline_color_override="red",
opacity=.5,
title='Deaths per week'
)
fig.show()
results = px.get_trendline_results(fig)
results = results.iloc[0]["px_fit_results"].summary()
print(results)
# regression params not available for lowess
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.130
Model:                            OLS   Adj. R-squared:                  0.130
Method:                 Least Squares   F-statistic:                     174.4
Date:                Thu, 30 Jun 2022   Prob (F-statistic):           3.34e-37
Time:                        15:41:46   Log-Likelihood:                -7709.5
No. Observations:                1165   AIC:                         1.542e+04
Df Residuals:                    1163   BIC:                         1.543e+04
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1065.5178     34.203     31.152      0.000     998.410    1132.625
x1           3.43e-07    2.6e-08     13.206      0.000    2.92e-07    3.94e-07
==============================================================================
Omnibus:                      799.913   Durbin-Watson:                   0.576
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            16455.842
Skew:                           2.882   Prob(JB):                         0.00
Kurtosis:                      20.487   Cond. No.                     8.49e+09
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.49e+09. This might indicate that there are strong multicollinearity or other numerical problems.
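For reference, plotly express relies on statsmodels for trendline="ols", so an equivalent fit can be run directly. A minimal sketch using integer week positions instead of the timestamp values px feeds in (the x1 coefficient changes scale accordingly, while the R-squared stays essentially the same):
import statsmodels.api as sm
# design matrix: intercept + week index 0..n-1
X = sm.add_constant(np.arange(len(final_df)))
ols_model = sm.OLS(final_df.nr_deaths.values, X).fit()
print(ols_model.summary())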
y = final_df.nr_deaths.values
# use simple integer week positions (0..n-1) as the regressor instead of dates
x = np.arange(0, len(y)).reshape(-1, 1)
model = LinearRegression()
model.fit(x, y)
x_range_ordinal = np.linspace(x.min(), x.max(), len(y))
y_range = model.predict(x_range_ordinal.reshape(-1, 1))
len(x_range_ordinal), len(y_range)
(1165, 1165)
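The fitted line's parameters can be read off the sklearn model directly (a small sketch; intercept_ and coef_ are the standard fitted attributes), giving the intercept (weekly deaths at week index 0) and the slope (change per week index):
# intercept and slope of the linear fit
print(model.intercept_, model.coef_)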
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
opacity=.5,
# trendline='ols', trendline_color_override='darkblue',
title='Deaths per week'
)
fig.add_traces(
go.Scatter(
x=final_df.index,
y=y_range,
name='Regression Fit'))
fig.show()
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
trendline="lowess",
trendline_color_override="red",
trendline_options=dict(frac=0.1),
opacity=.5,
title='Deaths per week'
)
fig.show()
# regression params not available for lowess
poly_degree = 10
y = final_df.nr_deaths.values
x = np.arange(0, len(y))
x = x.reshape(-1, 1)
# just for checking with sklearn implementation
poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
poly_features = poly.fit_transform(x)
poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y)
# y_range_poly = poly_reg_model.predict(
# poly_pred_x)
# print(poly_reg_model.intercept_)
# print(poly_reg_model.coef_)
###
#-----
# fit a degree-10 polynomial to the weekly death counts
fitted_params = np.polyfit(np.arange(0, len(y)), y, poly_degree)
polynomials = np.poly1d(fitted_params)
# analytic derivative of the fitted polynomial
derivatives = np.polyder(polynomials)
# fitted values and slopes of the curve at every week
y_value_at_point = polynomials(x).flatten()
slope_at_point = np.polyval(derivatives, np.arange(0, len(y)))
#-----
# x_range_ordinal_poly = np.linspace(x.min(), x.max(), len(y))
# poly_pred_x = poly.fit_transform(x_range_ordinal_poly.reshape(-1, 1))
print(f'''
x: {len(x), x},
y: {len(y), y},
''')
print(f'''
fittedparams: {fitted_params, fitted_params},
derivs: {derivatives},
y vals at point: {y_value_at_point, len(y_value_at_point)},
slope at point: {slope_at_point}''')
x: (1165, array([[ 0], [ 1], [ 2], ..., [1162], [1163], [1164]])),
y: (1165, array([1867, 1902, 2027, ..., 1508, 1554, 1560])),

fittedparams: (array([ 1.12001971e-24, -7.91989854e-21,  2.36011784e-17, -3.88614603e-14,
                       3.88520208e-11, -2.43476311e-08,  9.50992557e-06, -2.22093300e-03,
                       2.82908130e-01, -1.62807870e+01,  1.71766195e+03]),
               array([ 1.12001971e-24, -7.91989854e-21,  2.36011784e-17, -3.88614603e-14,
                       3.88520208e-11, -2.43476311e-08,  9.50992557e-06, -2.22093300e-03,
                       2.82908130e-01, -1.62807870e+01,  1.71766195e+03])),
derivs: 1.12e-23 x^9 - 7.128e-20 x^8 + 1.888e-16 x^7 - 2.72e-13 x^6 + 2.331e-10 x^5
        - 1.217e-07 x^4 + 3.804e-05 x^3 - 0.006663 x^2 + 0.5658 x - 16.28,
y vals at point: (array([1717.66194592, 1701.66185556, 1686.21438827, ...,
                         1738.84192024, 1735.21864265, 1731.40871437]), 1165),
slope at point: [-16.28078705 -15.72159567 -15.17550335 ... -3.53128921 -3.71593226 -3.90459934]
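As a quick cross-check of the analytic slopes (a sketch, not part of the original analysis), a finite-difference derivative of the fitted curve should be close to slope_at_point away from the boundaries, where np.gradient falls back to one-sided differences:
# finite-difference slope of the fitted values (unit spacing between weeks)
numeric_slope = np.gradient(y_value_at_point)
# compare with the analytic derivative away from the boundaries
print(np.abs(numeric_slope[1:-1] - slope_at_point[1:-1]).max())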
def draw_slope_line_at_point(fig, ind, x, y, slope_at_point, verbose=False):
    """Draw the tangent line at index `ind` for the given x values, y values and per-point slopes."""
    # point-slope form: extend the tangent from the first to the last x value
    y_low = (x[0] - x[ind]) * slope_at_point[ind] + y[ind]
    y_high = (x[-1] - x[ind]) * slope_at_point[ind] + y[ind]
    x_vals = [x[0], x[-1]]
    y_vals = [y_low, y_high]
    if verbose:
        print((x[0] - x[ind]))
        print(x[ind], x_vals, y_vals, y[ind], slope_at_point[ind])
    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            name="Tangent at point",
            line=dict(color='orange', width=2, dash='dash'),
        )
    )
    return x_vals, y_vals
fig = px.scatter(
x=np.arange(0, len(y)),
#x=final_df.index,
y=final_df.nr_deaths,
opacity=.3,
title='Deaths per week'
)
fig.add_traces(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit 2'))
# Replace x axis ticks with dates instead numbers
# fig.update_xaxes(
# rangeslider_visible=True,
# ticktext=final_df.index.strftime('%Y-%m-%d')[::10],
# tickvals=np.arange(0, len(y))[::10],
#tickformatstops not working with ticktextZ
# )
for pt in [750 ,1080]:
# for pt in [31]:
draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
#x=final_df.index,
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
fig.add_annotation(x=pt, y=y_value_at_point[pt],
text=f'''Slope: {slope_at_point[pt]:.2f}\t {final_df.index.strftime('%Y-%m-%d')[pt]}''',
showarrow=True,
arrowhead=1)
fig.update_layout(
hovermode='x unified',
)
fig.show()
fig = px.scatter(
x=only_covid.index,
y=only_covid.covid_deaths,
trendline="ols",
trendline_color_override="red",
opacity=.5,
title='COVID-19 deaths per day'
)
fig.show()
fig = px.scatter(
x=only_covid.index,
y=only_covid.covid_deaths,
trendline="lowess",
trendline_color_override="red",
trendline_options=dict(frac=0.1),
opacity=.5,
title='COVID-19 deaths per day'
)
fig.show()
# regression params not available for lowess
poly_degree = 10
y = only_covid.covid_deaths.values
x = np.arange(0, len(y))
x = x.reshape(-1, 1)
# just for checking with sklearn implementation
poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
poly_features = poly.fit_transform(x)
poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y)
#-----
# same procedure for the daily COVID-19 series: degree-10 fit, analytic derivative,
# fitted values and slopes at every day
fitted_params = np.polyfit(np.arange(0, len(y)), y, poly_degree)
polynomials = np.poly1d(fitted_params)
derivatives = np.polyder(polynomials)
y_value_at_point = polynomials(x).flatten()
slope_at_point = np.polyval(derivatives, np.arange(0, len(y)))
#-----
# x_range_ordinal_poly = np.linspace(x.min(), x.max(), len(y))
# poly_pred_x = poly.fit_transform(x_range_ordinal_poly.reshape(-1, 1))
# print(f'''
# x: {len(x), x},
# y: {len(y), y},
# ''')
print(f'''
fittedparams: {fitted_params, fitted_params},
derivs: {derivatives},
y vals at point: {y_value_at_point, len(y_value_at_point)},
slope at point: {slope_at_point}''')
fittedparams: (array([-3.29416755e-13,  1.26451941e-10, -2.02479035e-08,  1.77003587e-06,
                      -9.26507045e-05,  2.99527695e-03, -5.96424960e-02,  7.09907843e-01,
                      -4.66671787e+00,  1.25294601e+01,  1.39709027e+01]),
               array([-3.29416755e-13,  1.26451941e-10, -2.02479035e-08,  1.77003587e-06,
                      -9.26507045e-05,  2.99527695e-03, -5.96424960e-02,  7.09907843e-01,
                      -4.66671787e+00,  1.25294601e+01,  1.39709027e+01])),
derivs: -3.294e-12 x^9 + 1.138e-09 x^8 - 1.62e-07 x^7 + 1.239e-05 x^6 - 0.0005559 x^5
        + 0.01498 x^4 - 0.2386 x^3 + 2.13 x^2 - 9.333 x + 12.53,
y vals at point: (array([13.97090266, 22.4868146 , 25.17807482, 24.55934231, 22.3022525 ,
                         19.45536845, 16.62054399, 14.09204724, 11.96414985, 10.21228534,
                          8.75231638,  7.4819266 ,  6.30766423,  5.1607125 ,  4.00404452,
                          2.83323543,  1.67285297,  0.57002568, -0.41350281, -1.20978868,
                         -1.75276229, -1.98536217, -1.86531298, -1.36929271, -0.49537231,
                          0.73627198,  2.2842772 ,  4.0884688 ,  6.0734584 ,  8.1531869 ,
                         10.23610819, 12.23069481, 14.05094648, 15.62159384, 16.88271135,
                         17.7934848 , 18.33491844, 18.51131322, 18.35040002, 17.90206911,
                         17.23569723, 16.43613666, 15.59849449, 14.82189396, 14.20247212,
                         13.82592783, 13.75999048, 14.04723062, 14.69867926, 15.68875999,
                         16.95206751, 18.38254582, 19.83562782, 21.13389493, 22.07679833,
                         22.45495252, 22.06946502, 20.75670213, 18.41880923, 15.06020257,
                         10.83012866,  6.07124354,  1.37399871, -2.36356978, -3.87126782,
                         -1.4382456 ,  7.1582941 ]), 67),
slope at point: [ 1.25294601e+01  5.10161060e+00  6.95528608e-01 -1.65803685e+00
 -2.68430654e+00 -2.91145774e+00 -2.71092751e+00 -2.33169590e+00
 -1.92917152e+00 -1.58926187e+00 -1.34817221e+00 -1.20843916e+00
 -1.15166889e+00 -1.14841469e+00 -1.16559480e+00 -1.17181845e+00
 -1.14095682e+00 -1.05426499e+00 -9.01332028e-01 -6.80108222e-01
 -3.96231860e-01 -6.18522935e-02  3.05878322e-01  6.86495374e-01
  1.05796699e+00  1.39838761e+00  1.68753311e+00  1.90820892e+00
  2.04733993e+00  2.09676685e+00  2.05372861e+00  1.92102383e+00
  1.70685712e+00  1.42438670e+00  1.09100042e+00  7.27355606e-01
  3.56226146e-01  1.20639674e-03 -3.14673055e-01 -5.70358814e-01
 -7.48155426e-01 -8.35009954e-01 -8.23636397e-01 -7.13416735e-01
 -5.11017275e-01 -2.30662127e-01  1.05990062e-01  4.70414455e-01
  8.28275109e-01  1.14100051e+00  1.36790420e+00  1.46886486e+00
  1.40756859e+00  1.15530372e+00  6.95284288e-01  2.74633463e-02
 -8.26218782e-01 -1.81622405e+00 -2.85953514e+00 -3.83361391e+00
 -4.57033195e+00 -4.85002765e+00 -4.39586760e+00 -2.86871468e+00
  1.37268660e-01  5.09802684e+00  1.25617435e+01]
fig = px.scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
opacity=.5,
title='COVID-19 deaths per day'
)
fig.add_traces(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit 2'))
for pt in [31]:
draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
#x=final_df.index,
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
fig.add_annotation(
x=pt,
y=y_value_at_point[pt],
text=f'''Slope: {slope_at_point[pt]:.2f}\t {only_covid.index.strftime('%Y-%m-%d')[pt]}''',
showarrow=True,
arrowhead=1)
fig.update_layout(
hovermode='x unified',
)
fig.show()
traces = []
animation_dicts = dict()
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
name='Covid deaths',
mode='lines',
opacity=.5,
line=dict(width=1.5)
))
# traces.append(
# go.Scatter(
# x=np.arange(0, len(y)),
# y=only_covid.covid_deaths,
# mode='lines',
# opacity=.5,
# line={'shape': 'spline', 'smoothing': 1.3}
# ))
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
name='Covid deaths',
mode='markers',
opacity=.5,
line=dict(width=1)
))
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit',
mode='lines',
opacity=.5,
line=dict(width=1.5)
))
# compute tangent-line endpoints for every point of the COVID period
# (draw_slope_line_at_point also adds these tangents as traces to the figure above)
for pt in np.arange(0, len(y)):#[31, 60]:
x_vals, y_vals = draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
animation_dicts[pt]= [x_vals, y_vals]
# traces.append(
# go.Scatter(
# x=x_vals,
# y=y_vals,
# mode='lines',
# opacity=.4,
# line=dict(width=1.5)))
# fig.add_annotation(
# x=pt,
# y=y_value_at_point[pt],
# text=f'''Slope: {slope_at_point[pt]:.2f}\t {only_covid.index.strftime('%Y-%m-%d')[pt]}''',
# showarrow=True,
# arrowhead=1)
frame_data = []
slider_steps = []
for k in range(0, len(final_df)):
# frame_data.append(
# dict(data=
# [dict(
# type = 'scatter',
# x=np.arange(0, len(y))[:k],
# y=only_covid.covid_deaths[:k]
# )]
# )
# )
# add slope lines
if k in animation_dicts.keys():
frame_data.append(
dict(data=
[dict(
type = 'scatter',
x=animation_dicts[k][0],
y=animation_dicts[k][1],
mode='lines',
line={'dash': 'dash', 'color': 'green'}
)]
)
)
slider_steps.append(
{"args": [
frame_data[k],
{"frame": {"duration": 300, "redraw": False},
"mode": "immediate",
"transition": {"duration": 300}}
],
"label": k,
"method": "animate"}
)
all_frames = frame_data
frames=all_frames
# dict(
# data=[
# dict(
# type = 'scatter',
# x=np.arange(0, len(y))[:k],
# y=only_covid.covid_deaths[:k]
# ),
# dict(
# type = 'scatter',
# x=animation_dicts[k][0],
# y=animation_dicts[k][1]
# ),
# ]
# )
# for k in range(0, len(final_df))
sliders_dict = {
"active": 0,
"yanchor": "top",
"xanchor": "left",
"currentvalue": {
"font": {"size": 20},
"prefix": "Week:",
"visible": True,
"xanchor": "right"
},
"transition": {"duration": 300, "easing": "cubic-in-out"},
"pad": {"b": 10, "t": 50},
"len": 0.9,
"x": 0.1,
"y": 0,
"steps": slider_steps
}
layout = go.Layout(
xaxis={"range": [0, len(y)], "title": "weeks"},
sliders=[sliders_dict],
# showlegend=False,
hovermode='x unified',
updatemenus=[
dict(
type='buttons',
buttons=[
dict(
label='Show tangents',
method='animate',
args=[None,
dict(frame=dict(duration=200,
redraw=False),
transition=dict(duration=0),
fromcurrent=True,
mode='immediate')
]),
{
"args": [[None], {"frame": {"duration": 0, "redraw": False},
"mode": "immediate",
"transition": {"duration": 0}}],
"label": "Pause",
"method": "animate"
}
],
direction="left",
pad= {"r": 10, "t": 87},
showactive= False,
x=0.1,
xanchor= "right",
y= 0,
yanchor= "top"
)
]
)
fig = go.Figure(
data=traces,
frames=frames,
layout=layout)
fig.show()
# pio.write_html(fig, file='figure.html', auto_open=True)
A proper interpretation would require much more research. As stated above, this notebook also has technical implementation goals rather than merely analysing deaths in Austria.
I, as an averagely educated person in this matter, witness that