The data is used from data.gv.at which is Austrias official open data source.
More about it here
The used datasets are from:
This project has multiple goals.
import datetime
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import plotly.io as pio
# for rendering plot static
# pio.renderers.default = "svg"
data_file_name = 'data/OGD_gest_kalwo_GEST_KALWOCHE_100.csv'
covid_deaths_data_file_name = 'data/timeline-faelle-bundeslaender.csv'
df = pd.read_csv(data_file_name, sep=';')
df_deaths = pd.read_csv(covid_deaths_data_file_name, sep=';')
df
| C-KALWOCHE-0 | C-B00-0 | C-ALTERGR65-0 | C-C11-0 | F-ANZ-1 | |
|---|---|---|---|---|---|
| 0 | KALW-200001 | B00-1 | ALTERSGR65-1 | C11-1 | 8 |
| 1 | KALW-200001 | B00-1 | ALTERSGR65-1 | C11-2 | 2 |
| 2 | KALW-200001 | B00-1 | ALTERSGR65-2 | C11-1 | 25 |
| 3 | KALW-200001 | B00-1 | ALTERSGR65-2 | C11-2 | 33 |
| 4 | KALW-200001 | B00-2 | ALTERSGR65-1 | C11-1 | 7 |
| ... | ... | ... | ... | ... | ... |
| 41998 | KALW-202223 | B00-8 | ALTERSGR65-2 | C11-2 | 28 |
| 41999 | KALW-202223 | B00-9 | ALTERSGR65-1 | C11-1 | 24 |
| 42000 | KALW-202223 | B00-9 | ALTERSGR65-1 | C11-2 | 13 |
| 42001 | KALW-202223 | B00-9 | ALTERSGR65-2 | C11-1 | 125 |
| 42002 | KALW-202223 | B00-9 | ALTERSGR65-2 | C11-2 | 159 |
42003 rows × 5 columns
df_deaths
| Datum | BundeslandID | Name | BestaetigteFaelleBundeslaender | Todesfaelle | Genesen | Hospitalisierung | Intensivstation | Testungen | TestungenPCR | TestungenAntigen | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-03-01T09:30:00+01:00 | 1 | Burgenland | 12657 | 239 | 11701 | 43 | 10 | 638575 | 155435 | 483140 |
| 1 | 2021-03-01T09:30:00+01:00 | 2 | Kärnten | 29225 | 683 | 27316 | 96 | 15 | 675557 | 217933 | 457624 |
| 2 | 2021-03-01T09:30:00+01:00 | 3 | Niederösterreich | 74538 | 1350 | 67900 | 349 | 82 | 3400756 | 1141984 | 2258772 |
| 3 | 2021-03-01T09:30:00+01:00 | 4 | Oberösterreich | 86409 | 1515 | 82567 | 120 | 20 | 2162517 | 546777 | 1615740 |
| 4 | 2021-03-01T09:30:00+01:00 | 5 | Salzburg | 37327 | 493 | 35318 | 71 | 15 | 823353 | 274598 | 548755 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4845 | 2022-06-28T09:30:00+02:00 | 6 | Steiermark | 561729 | 3399 | 551626 | 102 | 8 | 21098196 | 5816694 | 15281502 |
| 4846 | 2022-06-28T09:30:00+02:00 | 7 | Tirol | 367338 | 939 | 361988 | 77 | 2 | 9590745 | 3845911 | 5744834 |
| 4847 | 2022-06-28T09:30:00+02:00 | 8 | Vorarlberg | 211284 | 549 | 208983 | 20 | 2 | 7088523 | 1543803 | 5544720 |
| 4848 | 2022-06-28T09:30:00+02:00 | 9 | Wien | 955399 | 4028 | 914608 | 199 | 23 | 65808360 | 58193874 | 7614486 |
| 4849 | 2022-06-28T09:30:00+02:00 | 10 | Österreich | 4403444 | 18768 | 4293115 | 842 | 47 | 190225970 | 96219579 | 94006391 |
4850 rows × 11 columns
# profile = ProfileReport(df, title="Pandas Profiling Report")
# profile2 = ProfileReport(df_deaths, title="Pandas Profiling Report")
# profile2
df = df.rename(columns={"C-KALWOCHE-0": "cal_week", "C-B00-0": "state", "C-C11-0": "gender", "F-ANZ-1": "counts" })
df['year'] = [d.split("-")[1][:4] for d in df['cal_week']]
df['cal_week'] = [d.split("-")[1][4:] for d in df['cal_week']]
df['formatted_date'] = df.year.astype(str) + df.cal_week.astype(str) + '0'
df['date'] = pd.to_datetime(df['formatted_date'], format='%Y%W%w')
# df['datetime'] = pd.to_datetime(df.year.astype(str) + '-' +
# df.cal_week.astype(str) + '-1' , format="%Y-%W-%w").dt.strftime('%Y-%W')
df['conv_date']= df.date.map(datetime.datetime.toordinal)
df
| cal_week | state | C-ALTERGR65-0 | gender | counts | year | formatted_date | date | conv_date | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 01 | B00-1 | ALTERSGR65-1 | C11-1 | 8 | 2000 | 2000010 | 2000-01-09 | 730128 |
| 1 | 01 | B00-1 | ALTERSGR65-1 | C11-2 | 2 | 2000 | 2000010 | 2000-01-09 | 730128 |
| 2 | 01 | B00-1 | ALTERSGR65-2 | C11-1 | 25 | 2000 | 2000010 | 2000-01-09 | 730128 |
| 3 | 01 | B00-1 | ALTERSGR65-2 | C11-2 | 33 | 2000 | 2000010 | 2000-01-09 | 730128 |
| 4 | 01 | B00-2 | ALTERSGR65-1 | C11-1 | 7 | 2000 | 2000010 | 2000-01-09 | 730128 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 41998 | 23 | B00-8 | ALTERSGR65-2 | C11-2 | 28 | 2022 | 2022230 | 2022-06-12 | 738318 |
| 41999 | 23 | B00-9 | ALTERSGR65-1 | C11-1 | 24 | 2022 | 2022230 | 2022-06-12 | 738318 |
| 42000 | 23 | B00-9 | ALTERSGR65-1 | C11-2 | 13 | 2022 | 2022230 | 2022-06-12 | 738318 |
| 42001 | 23 | B00-9 | ALTERSGR65-2 | C11-1 | 125 | 2022 | 2022230 | 2022-06-12 | 738318 |
| 42002 | 23 | B00-9 | ALTERSGR65-2 | C11-2 | 159 | 2022 | 2022230 | 2022-06-12 | 738318 |
42003 rows × 9 columns
df.groupby('date').agg('sum')
| counts | conv_date | |
|---|---|---|
| date | ||
| 2000-01-09 | 1867 | 26284608 |
| 2000-01-16 | 1902 | 26284860 |
| 2000-01-23 | 2027 | 26285112 |
| 2000-01-30 | 1940 | 26285364 |
| 2000-02-06 | 1928 | 26285616 |
| ... | ... | ... |
| 2022-05-15 | 1568 | 25840150 |
| 2022-05-22 | 1592 | 25840395 |
| 2022-05-29 | 1508 | 26578944 |
| 2022-06-05 | 1554 | 26579196 |
| 2022-06-12 | 1560 | 26579448 |
1167 rows × 2 columns
grpd_date = df.groupby('date').agg('sum')
grpd_date = grpd_date.rename(columns={'counts':'nr_deaths'})
# grpd_date['date'] = grpd_date.index
# grpd_date
temp = df_deaths.copy(deep=True)
df_deaths = temp.copy(deep=True)
df_deaths = df_deaths[df_deaths.Name == 'Österreich']
df_deaths
| Datum | BundeslandID | Name | BestaetigteFaelleBundeslaender | Todesfaelle | Genesen | Hospitalisierung | Intensivstation | Testungen | TestungenPCR | TestungenAntigen | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 2021-03-01T09:30:00+01:00 | 10 | Österreich | 460849 | 8574 | 432016 | 1353 | 290 | 15003345 | 5395666 | 9607679 |
| 19 | 2021-03-02T09:30:00+01:00 | 10 | Österreich | 462769 | 8605 | 433873 | 1427 | 296 | 15358139 | 5456284 | 9901855 |
| 29 | 2021-03-03T09:30:00+01:00 | 10 | Österreich | 465322 | 8625 | 435669 | 1415 | 313 | 15602870 | 5507472 | 10095398 |
| 39 | 2021-03-04T09:30:00+01:00 | 10 | Österreich | 467646 | 8652 | 437202 | 1425 | 302 | 15864120 | 5557591 | 10306529 |
| 49 | 2021-03-05T09:30:00+01:00 | 10 | Österreich | 470314 | 8669 | 439101 | 1421 | 326 | 16123615 | 5610644 | 10512971 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4809 | 2022-06-24T09:30:00+02:00 | 10 | Österreich | 4370852 | 18757 | 4271277 | 662 | 42 | 189894351 | 95918261 | 93976090 |
| 4819 | 2022-06-25T09:30:00+02:00 | 10 | Österreich | 4379764 | 18758 | 4276737 | 677 | 44 | 189984714 | 96003641 | 93981073 |
| 4829 | 2022-06-26T09:30:00+02:00 | 10 | Österreich | 4386857 | 18760 | 4281819 | 686 | 46 | 190048498 | 96065222 | 93983276 |
| 4839 | 2022-06-27T09:30:00+02:00 | 10 | Österreich | 4393255 | 18764 | 4286495 | 777 | 48 | 190108723 | 96122902 | 93985821 |
| 4849 | 2022-06-28T09:30:00+02:00 | 10 | Österreich | 4403444 | 18768 | 4293115 | 842 | 47 | 190225970 | 96219579 | 94006391 |
485 rows × 11 columns
df_deaths = df_deaths.rename(columns={'Datum':'formatted_date', 'Todesfaelle':'deaths'})
df_deaths = df_deaths[['formatted_date', 'deaths']]
df_deaths
| formatted_date | deaths | |
|---|---|---|
| 9 | 2021-03-01T09:30:00+01:00 | 8574 |
| 19 | 2021-03-02T09:30:00+01:00 | 8605 |
| 29 | 2021-03-03T09:30:00+01:00 | 8625 |
| 39 | 2021-03-04T09:30:00+01:00 | 8652 |
| 49 | 2021-03-05T09:30:00+01:00 | 8669 |
| ... | ... | ... |
| 4809 | 2022-06-24T09:30:00+02:00 | 18757 |
| 4819 | 2022-06-25T09:30:00+02:00 | 18758 |
| 4829 | 2022-06-26T09:30:00+02:00 | 18760 |
| 4839 | 2022-06-27T09:30:00+02:00 | 18764 |
| 4849 | 2022-06-28T09:30:00+02:00 | 18768 |
485 rows × 2 columns
df_deaths['formatted_date'] = [d.split("T")[0] for d in df_deaths['formatted_date']]
df_deaths['date'] = pd.to_datetime(df_deaths['formatted_date'])
# df_deaths['formatted_date'] = pd.to_datetime(df_deaths['formatted_date'])
# df_deaths['formatted_date'] = df_deaths['formatted_date'].dt.tz_localize('CET', utc=True)
# df_deaths['date'] = df_deaths['formatted_date'].dt.strftime('%Y-%m-%d')
df_deaths
| formatted_date | deaths | date | |
|---|---|---|---|
| 9 | 2021-03-01 | 8574 | 2021-03-01 |
| 19 | 2021-03-02 | 8605 | 2021-03-02 |
| 29 | 2021-03-03 | 8625 | 2021-03-03 |
| 39 | 2021-03-04 | 8652 | 2021-03-04 |
| 49 | 2021-03-05 | 8669 | 2021-03-05 |
| ... | ... | ... | ... |
| 4809 | 2022-06-24 | 18757 | 2022-06-24 |
| 4819 | 2022-06-25 | 18758 | 2022-06-25 |
| 4829 | 2022-06-26 | 18760 | 2022-06-26 |
| 4839 | 2022-06-27 | 18764 | 2022-06-27 |
| 4849 | 2022-06-28 | 18768 | 2022-06-28 |
485 rows × 3 columns
grpd_date_df_deaths = df_deaths.groupby('date').agg('sum')
grpd_date_df_deaths['deaths_per_day'] = grpd_date_df_deaths.diff()
# https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(21)02796-3/fulltext#seccestitle150
# austria has ratio of 1.33
# grpd_date_df_deaths['deaths_per_day_corrected'] = grpd_date_df_deaths.deaths_per_day * 1.33
grpd_date_df_deaths
| deaths | deaths_per_day | |
|---|---|---|
| date | ||
| 2021-03-01 | 8574 | NaN |
| 2021-03-02 | 8605 | 31.0 |
| 2021-03-03 | 8625 | 20.0 |
| 2021-03-04 | 8652 | 27.0 |
| 2021-03-05 | 8669 | 17.0 |
| ... | ... | ... |
| 2022-06-24 | 18757 | 8.0 |
| 2022-06-25 | 18758 | 1.0 |
| 2022-06-26 | 18760 | 2.0 |
| 2022-06-27 | 18764 | 4.0 |
| 2022-06-28 | 18768 | 4.0 |
485 rows × 2 columns
fig = px.line(
x=grpd_date_df_deaths.index,
y=grpd_date_df_deaths.deaths_per_day,
title='Deaths per year'
)
fig.show()
total_df = pd.merge(grpd_date, grpd_date_df_deaths, how='left', on='date')
total_df
| nr_deaths | conv_date | deaths | deaths_per_day | |
|---|---|---|---|---|
| date | ||||
| 2000-01-09 | 1867 | 26284608 | NaN | NaN |
| 2000-01-16 | 1902 | 26284860 | NaN | NaN |
| 2000-01-23 | 2027 | 26285112 | NaN | NaN |
| 2000-01-30 | 1940 | 26285364 | NaN | NaN |
| 2000-02-06 | 1928 | 26285616 | NaN | NaN |
| ... | ... | ... | ... | ... |
| 2022-05-15 | 1568 | 25840150 | 18303.0 | 0.0 |
| 2022-05-22 | 1592 | 25840395 | 18347.0 | 4.0 |
| 2022-05-29 | 1508 | 26578944 | 18651.0 | 0.0 |
| 2022-06-05 | 1554 | 26579196 | 18670.0 | 1.0 |
| 2022-06-12 | 1560 | 26579448 | 18697.0 | 3.0 |
1167 rows × 4 columns
total_df = total_df.rename(
columns={'deaths_per_day':'covid_deaths'})
total_df = total_df[['nr_deaths', 'covid_deaths']]
total_df
| nr_deaths | covid_deaths | |
|---|---|---|
| date | ||
| 2000-01-09 | 1867 | NaN |
| 2000-01-16 | 1902 | NaN |
| 2000-01-23 | 2027 | NaN |
| 2000-01-30 | 1940 | NaN |
| 2000-02-06 | 1928 | NaN |
| ... | ... | ... |
| 2022-05-15 | 1568 | 0.0 |
| 2022-05-22 | 1592 | 4.0 |
| 2022-05-29 | 1508 | 0.0 |
| 2022-06-05 | 1554 | 1.0 |
| 2022-06-12 | 1560 | 3.0 |
1167 rows × 2 columns
total_df.describe()
| nr_deaths | covid_deaths | |
|---|---|---|
| count | 1167.000000 | 67.000000 |
| mean | 1513.545844 | 11.268657 |
| std | 207.013936 | 10.300900 |
| min | 1185.000000 | 0.000000 |
| 25% | 1392.000000 | 3.000000 |
| 50% | 1473.000000 | 6.000000 |
| 75% | 1578.000000 | 18.500000 |
| max | 3953.000000 | 43.000000 |
upper_limit = total_df['nr_deaths'].quantile(0.99999)
lower_limit = total_df['nr_deaths'].quantile(0.00001)
new_df = total_df[(
total_df['nr_deaths'] <= upper_limit) & (
total_df['nr_deaths'] >= lower_limit)]
new_df
| nr_deaths | covid_deaths | |
|---|---|---|
| date | ||
| 2000-01-09 | 1867 | NaN |
| 2000-01-16 | 1902 | NaN |
| 2000-01-23 | 2027 | NaN |
| 2000-01-30 | 1940 | NaN |
| 2000-02-06 | 1928 | NaN |
| ... | ... | ... |
| 2022-05-15 | 1568 | 0.0 |
| 2022-05-22 | 1592 | 4.0 |
| 2022-05-29 | 1508 | 0.0 |
| 2022-06-05 | 1554 | 1.0 |
| 2022-06-12 | 1560 | 3.0 |
1165 rows × 2 columns
final_df = new_df
only_covid = final_df[final_df.covid_deaths.notna()]
only_covid.shape
(67, 2)
# only if we want to limit all deaths to covid time period
# final_df = only_covid
fig = px.line(
x=final_df.index,
y=final_df.nr_deaths,
title='Deaths per year'
)
fig.add_traces(
go.Scatter(
x=final_df.index, y=final_df.covid_deaths,
name='Covid deaths'))
fig.show()
trace1 = go.Scatter(
x=final_df.index,
y=final_df.nr_deaths,
mode='lines',
line=dict(width=1.5))
trace2 = go.Scatter(
x=final_df.index, y=final_df.covid_deaths,
name='Covid deaths')
frames=[
dict(
data=[
dict(
type = 'scatter',
x=final_df.index[:k],
y=final_df.nr_deaths[:k]),
dict(
type = 'scatter',
x=final_df.index[:k],
y=final_df.covid_deaths[:k])]
)
for k in range(0, len(final_df))]
layout = go.Layout(width=1000,
height=600,
showlegend=False,
hovermode='x unified',
updatemenus=[
dict(
type='buttons', showactive=False,
y=1.05,
x=1.15,
xanchor='right',
yanchor='top',
pad=dict(t=0, r=10),
buttons=[dict(label='Build line',
method='animate',
args=[None,
dict(frame=dict(duration=2,
redraw=False),
transition=dict(duration=0),
fromcurrent=True,
mode='immediate')]
)]
),
dict(
type = "buttons",
direction = "left",
buttons=list([
dict(
args=[{"yaxis.type": "linear"}],
label="LINEAR",
method="relayout"
),
dict(
args=[{"yaxis.type": "log"}],
label="LOG",
method="relayout"
)
]),
),
]
)
# layout.update(xaxis =dict(range=['2020-03-16', '2020-06-13'], autorange=False),
# yaxis =dict(range=[0, 35000], autorange=False));
fig = go.Figure(data=[trace1, trace2], frames=frames, layout=layout)
# fig.show()
Ordinar Least Squares
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
trendline="ols",
trendline_color_override="red",
opacity=.5,
title='Deaths per year'
)
fig.show()
results = px.get_trendline_results(fig)
results = results.iloc[0]["px_fit_results"].summary()
print(results)
# regression params not available for lowess
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.130
Model: OLS Adj. R-squared: 0.130
Method: Least Squares F-statistic: 174.4
Date: Thu, 30 Jun 2022 Prob (F-statistic): 3.34e-37
Time: 15:41:46 Log-Likelihood: -7709.5
No. Observations: 1165 AIC: 1.542e+04
Df Residuals: 1163 BIC: 1.543e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1065.5178 34.203 31.152 0.000 998.410 1132.625
x1 3.43e-07 2.6e-08 13.206 0.000 2.92e-07 3.94e-07
==============================================================================
Omnibus: 799.913 Durbin-Watson: 0.576
Prob(Omnibus): 0.000 Jarque-Bera (JB): 16455.842
Skew: 2.882 Prob(JB): 0.00
Kurtosis: 20.487 Cond. No. 8.49e+09
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.49e+09. This might indicate that there are
strong multicollinearity or other numerical problems.
x=final_df.index.values
y=final_df.nr_deaths.values
x = np.arange(0, len(y))
x = x.reshape(-1, 1)
model = LinearRegression()
model.fit(x, y)
x_range_ordinal = np.linspace(x.min(), x.max(), len(y))
y_range = model.predict(x_range_ordinal.reshape(-1, 1))
len(x_range_ordinal), len(y_range)
(1165, 1165)
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
opacity=.5,
# trendline='ols', trendline_color_override='darkblue',
title='Deaths per year'
)
fig.add_traces(
go.Scatter(
x=final_df.index,
y=y_range,
name='Regression Fit'))
fig.show()
fig = px.scatter(
x=final_df.index,
y=final_df.nr_deaths,
trendline="lowess",
trendline_color_override="red",
trendline_options=dict(frac=0.1),
opacity=.5,
title='Deaths per year'
)
fig.show()
# regression params not available for lowess
poly_degree = 10
y = final_df.nr_deaths.values
x = np.arange(0, len(y))
x = x.reshape(-1, 1)
# just for checking with sklearn implementation
poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
poly_features = poly.fit_transform(x)
poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y)
# y_range_poly = poly_reg_model.predict(
# poly_pred_x)
# print(poly_reg_model.intercept_)
# print(poly_reg_model.coef_)
###
#-----
fitted_params = np.polyfit(np.arange(0, len(y)), y, poly_degree )
polynomials = np.poly1d(fitted_params)
derivatives = np.polyder(polynomials)
y_value_at_point = polynomials(x).flatten()
slope_at_point = np.polyval(derivatives, np.arange(0, len(y)))
#-----
# x_range_ordinal_poly = np.linspace(x.min(), x.max(), len(y))
# poly_pred_x = poly.fit_transform(x_range_ordinal_poly.reshape(-1, 1))
print(f'''
x: {len(x), x},
y: {len(y), y},
''')
print(f'''
fittedparams: {fitted_params, fitted_params},
derivs: {derivatives},
y vals at point: {y_value_at_point, len(y_value_at_point)},
slope at point: {slope_at_point}''')
x: (1165, array([[ 0],
[ 1],
[ 2],
...,
[1162],
[1163],
[1164]])),
y: (1165, array([1867, 1902, 2027, ..., 1508, 1554, 1560])),
fittedparams: (array([ 1.12001971e-24, -7.91989854e-21, 2.36011784e-17, -3.88614603e-14,
3.88520208e-11, -2.43476311e-08, 9.50992557e-06, -2.22093300e-03,
2.82908130e-01, -1.62807870e+01, 1.71766195e+03]), array([ 1.12001971e-24, -7.91989854e-21, 2.36011784e-17, -3.88614603e-14,
3.88520208e-11, -2.43476311e-08, 9.50992557e-06, -2.22093300e-03,
2.82908130e-01, -1.62807870e+01, 1.71766195e+03])),
derivs: 9 8 7 6 5
1.12e-23 x - 7.128e-20 x + 1.888e-16 x - 2.72e-13 x + 2.331e-10 x
4 3 2
- 1.217e-07 x + 3.804e-05 x - 0.006663 x + 0.5658 x - 16.28,
y vals at point: (array([1717.66194592, 1701.66185556, 1686.21438827, ..., 1738.84192024,
1735.21864265, 1731.40871437]), 1165),
slope at point: [-16.28078705 -15.72159567 -15.17550335 ... -3.53128921 -3.71593226
-3.90459934]
def draw_slope_line_at_point(fig, ind, x, y, slope_at_point, verbose=False):
"""Plot a line from an index at a specific point for x values, y values and their slopes"""
y_low = (x[0] - x[ind]) * slope_at_point[ind] + y[ind]
y_high = (x[-1] - x[ind]) * slope_at_point[ind] + y[ind]
x_vals = [x[0], x[-1]]
y_vals = [y_low, y_high]
if verbose:
print((x[0] - x[ind]))
print(x[ind], x_vals, y_vals, y[ind],slope_at_point[ind])
fig.add_trace(
go.Scatter(
x=x_vals,
y=y_vals,
name="Tangent at point",
line = dict(color='orange', width=2, dash='dash'),
)
)
return x_vals, y_vals
fig = px.scatter(
x=np.arange(0, len(y)),
#x=final_df.index,
y=final_df.nr_deaths,
opacity=.3,
title='Deaths per year'
)
fig.add_traces(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit 2'))
# Replace x axis ticks with dates instead numbers
# fig.update_xaxes(
# rangeslider_visible=True,
# ticktext=final_df.index.strftime('%Y-%m-%d')[::10],
# tickvals=np.arange(0, len(y))[::10],
#tickformatstops not working with ticktextZ
# )
for pt in [750 ,1080]:
# for pt in [31]:
draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
#x=final_df.index,
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
fig.add_annotation(x=pt, y=y_value_at_point[pt],
text=f'''Slope: {slope_at_point[pt]:.2f}\t {final_df.index.strftime('%Y-%m-%d')[pt]}''',
showarrow=True,
arrowhead=1)
fig.update_layout(
hovermode='x unified',
)
fig.show()
fig = px.scatter(
x=only_covid.index,
y=only_covid.covid_deaths,
trendline="ols",
trendline_color_override="red",
opacity=.5,
title='Deaths per year'
)
fig.show()
fig = px.scatter(
x=only_covid.index,
y=only_covid.covid_deaths,
trendline="lowess",
trendline_color_override="red",
trendline_options=dict(frac=0.1),
opacity=.5,
title='Deaths per year'
)
fig.show()
# regression params not available for lowess
poly_degree = 10
y = only_covid.covid_deaths.values
x = np.arange(0, len(y))
x = x.reshape(-1, 1)
# just for checking with sklearn implementation
poly = PolynomialFeatures(degree=poly_degree, include_bias=False)
poly_features = poly.fit_transform(x)
poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y)
#-----
fitted_params = np.polyfit(np.arange(0, len(y)), y, poly_degree )
polynomials = np.poly1d(fitted_params)
derivatives = np.polyder(polynomials)
y_value_at_point = polynomials(x).flatten()
slope_at_point = np.polyval(derivatives, np.arange(0, len(y)))
#-----
# x_range_ordinal_poly = np.linspace(x.min(), x.max(), len(y))
# poly_pred_x = poly.fit_transform(x_range_ordinal_poly.reshape(-1, 1))
# print(f'''
# x: {len(x), x},
# y: {len(y), y},
# ''')
print(f'''
fittedparams: {fitted_params, fitted_params},
derivs: {derivatives},
y vals at point: {y_value_at_point, len(y_value_at_point)},
slope at point: {slope_at_point}''')
fittedparams: (array([-3.29416755e-13, 1.26451941e-10, -2.02479035e-08, 1.77003587e-06,
-9.26507045e-05, 2.99527695e-03, -5.96424960e-02, 7.09907843e-01,
-4.66671787e+00, 1.25294601e+01, 1.39709027e+01]), array([-3.29416755e-13, 1.26451941e-10, -2.02479035e-08, 1.77003587e-06,
-9.26507045e-05, 2.99527695e-03, -5.96424960e-02, 7.09907843e-01,
-4.66671787e+00, 1.25294601e+01, 1.39709027e+01])),
derivs: 9 8 7 6 5
-3.294e-12 x + 1.138e-09 x - 1.62e-07 x + 1.239e-05 x - 0.0005559 x
4 3 2
+ 0.01498 x - 0.2386 x + 2.13 x - 9.333 x + 12.53,
y vals at point: (array([13.97090266, 22.4868146 , 25.17807482, 24.55934231, 22.3022525 ,
19.45536845, 16.62054399, 14.09204724, 11.96414985, 10.21228534,
8.75231638, 7.4819266 , 6.30766423, 5.1607125 , 4.00404452,
2.83323543, 1.67285297, 0.57002568, -0.41350281, -1.20978868,
-1.75276229, -1.98536217, -1.86531298, -1.36929271, -0.49537231,
0.73627198, 2.2842772 , 4.0884688 , 6.0734584 , 8.1531869 ,
10.23610819, 12.23069481, 14.05094648, 15.62159384, 16.88271135,
17.7934848 , 18.33491844, 18.51131322, 18.35040002, 17.90206911,
17.23569723, 16.43613666, 15.59849449, 14.82189396, 14.20247212,
13.82592783, 13.75999048, 14.04723062, 14.69867926, 15.68875999,
16.95206751, 18.38254582, 19.83562782, 21.13389493, 22.07679833,
22.45495252, 22.06946502, 20.75670213, 18.41880923, 15.06020257,
10.83012866, 6.07124354, 1.37399871, -2.36356978, -3.87126782,
-1.4382456 , 7.1582941 ]), 67),
slope at point: [ 1.25294601e+01 5.10161060e+00 6.95528608e-01 -1.65803685e+00
-2.68430654e+00 -2.91145774e+00 -2.71092751e+00 -2.33169590e+00
-1.92917152e+00 -1.58926187e+00 -1.34817221e+00 -1.20843916e+00
-1.15166889e+00 -1.14841469e+00 -1.16559480e+00 -1.17181845e+00
-1.14095682e+00 -1.05426499e+00 -9.01332028e-01 -6.80108222e-01
-3.96231860e-01 -6.18522935e-02 3.05878322e-01 6.86495374e-01
1.05796699e+00 1.39838761e+00 1.68753311e+00 1.90820892e+00
2.04733993e+00 2.09676685e+00 2.05372861e+00 1.92102383e+00
1.70685712e+00 1.42438670e+00 1.09100042e+00 7.27355606e-01
3.56226146e-01 1.20639674e-03 -3.14673055e-01 -5.70358814e-01
-7.48155426e-01 -8.35009954e-01 -8.23636397e-01 -7.13416735e-01
-5.11017275e-01 -2.30662127e-01 1.05990062e-01 4.70414455e-01
8.28275109e-01 1.14100051e+00 1.36790420e+00 1.46886486e+00
1.40756859e+00 1.15530372e+00 6.95284288e-01 2.74633463e-02
-8.26218782e-01 -1.81622405e+00 -2.85953514e+00 -3.83361391e+00
-4.57033195e+00 -4.85002765e+00 -4.39586760e+00 -2.86871468e+00
1.37268660e-01 5.09802684e+00 1.25617435e+01]
fig = px.scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
opacity=.5,
title='Deaths per year'
)
fig.add_traces(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit 2'))
for pt in [31]:
draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
#x=final_df.index,
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
fig.add_annotation(
x=pt,
y=y_value_at_point[pt],
text=f'''Slope: {slope_at_point[pt]:.2f}\t {only_covid.index.strftime('%Y-%m-%d')[pt]}''',
showarrow=True,
arrowhead=1)
fig.update_layout(
hovermode='x unified',
)
fig.show()
traces = []
animation_dicts = dict()
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
name='Covid deaths',
mode='lines',
opacity=.5,
line=dict(width=1.5)
))
# traces.append(
# go.Scatter(
# x=np.arange(0, len(y)),
# y=only_covid.covid_deaths,
# mode='lines',
# opacity=.5,
# line={'shape': 'spline', 'smoothing': 1.3}
# ))
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
y=only_covid.covid_deaths,
name='Covid deaths',
mode='markers',
opacity=.5,
line=dict(width=1)
))
traces.append(
go.Scatter(
x=np.arange(0, len(y)),
#x=final_df.index.strftime('%Y-%m-%d').to_list(),
y=y_value_at_point,
name='Polynomial regression Fit',
mode='lines',
opacity=.5,
line=dict(width=1.5)
))
for pt in np.arange(0, len(y)):#[31, 60]:
x_vals, y_vals = draw_slope_line_at_point(
fig,
x= np.arange(0, len(y)),
y = y_value_at_point,
slope_at_point=slope_at_point,
ind = pt)
animation_dicts[pt]= [x_vals, y_vals]
# traces.append(
# go.Scatter(
# x=x_vals,
# y=y_vals,
# mode='lines',
# opacity=.4,
# line=dict(width=1.5)))
# fig.add_annotation(
# x=pt,
# y=y_value_at_point[pt],
# text=f'''Slope: {slope_at_point[pt]:.2f}\t {only_covid.index.strftime('%Y-%m-%d')[pt]}''',
# showarrow=True,
# arrowhead=1)
frame_data = []
slider_steps = []
for k in range(0, len(final_df)):
# frame_data.append(
# dict(data=
# [dict(
# type = 'scatter',
# x=np.arange(0, len(y))[:k],
# y=only_covid.covid_deaths[:k]
# )]
# )
# )
# add slope lines
if k in animation_dicts.keys():
frame_data.append(
dict(data=
[dict(
type = 'scatter',
x=animation_dicts[k][0],
y=animation_dicts[k][1],
mode='lines',
line={'dash': 'dash', 'color': 'green'}
)]
)
)
slider_steps.append(
{"args": [
frame_data[k],
{"frame": {"duration": 300, "redraw": False},
"mode": "immediate",
"transition": {"duration": 300}}
],
"label": k,
"method": "animate"}
)
all_frames = frame_data
frames=all_frames
# dict(
# data=[
# dict(
# type = 'scatter',
# x=np.arange(0, len(y))[:k],
# y=only_covid.covid_deaths[:k]
# ),
# dict(
# type = 'scatter',
# x=animation_dicts[k][0],
# y=animation_dicts[k][1]
# ),
# ]
# )
# for k in range(0, len(final_df))
sliders_dict = {
"active": 0,
"yanchor": "top",
"xanchor": "left",
"currentvalue": {
"font": {"size": 20},
"prefix": "Week:",
"visible": True,
"xanchor": "right"
},
"transition": {"duration": 300, "easing": "cubic-in-out"},
"pad": {"b": 10, "t": 50},
"len": 0.9,
"x": 0.1,
"y": 0,
"steps": slider_steps
}
layout = go.Layout(
xaxis={"range": [0, len(y)], "title": "weeks"},
sliders=[sliders_dict],
# showlegend=False,
hovermode='x unified',
updatemenus=[
dict(
type='buttons',
buttons=[
dict(
label='Show tangents',
method='animate',
args=[None,
dict(frame=dict(duration=200,
redraw=False),
transition=dict(duration=0),
fromcurrent=True,
mode='immediate')
]),
{
"args": [[None], {"frame": {"duration": 0, "redraw": False},
"mode": "immediate",
"transition": {"duration": 0}}],
"label": "Pause",
"method": "animate"
}
],
direction="left",
pad= {"r": 10, "t": 87},
showactive= False,
x=0.1,
xanchor= "right",
y= 0,
yanchor= "top"
)
]
)
fig = go.Figure(
data=traces,
frames=frames,
layout=layout)
fig.show()
# pio.write_html(fig, file='figure.html', auto_open=True)
A proper interpretation would need a lot more research. As stated above this notebook as also technical implementational goals instead of merely analysing the deaths of people in Austria.
I, as an average educated person in this matter witness that