Open In Colab

# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import random
import seaborn as sns
from fbprophet import Prophet
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
# Load the COVID-19 dataset (one row per location per date) from the Colab
# working directory and preview its first five rows.
corona_df = pd.read_csv(filepath_or_buffer='/content/covid-data.csv')
corona_df.head(n=5)
iso_code continent location date total_cases new_cases total_deaths new_deaths total_cases_per_million new_cases_per_million total_deaths_per_million new_deaths_per_million total_tests new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed new_tests_smoothed_per_thousand tests_units stringency_index population population_density median_age aged_65_older aged_70_older gdp_per_capita extreme_poverty cvd_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
0 AFG Asia Afghanistan 2019-12-31 0 0 0 0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN NaN 38928341.0 54.422 18.6 2.581 1.337 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5
1 AFG Asia Afghanistan 2020-01-01 0 0 0 0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 38928341.0 54.422 18.6 2.581 1.337 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5
2 AFG Asia Afghanistan 2020-01-02 0 0 0 0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 38928341.0 54.422 18.6 2.581 1.337 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5
3 AFG Asia Afghanistan 2020-01-03 0 0 0 0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 38928341.0 54.422 18.6 2.581 1.337 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5
4 AFG Asia Afghanistan 2020-01-04 0 0 0 0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN NaN NaN 0.0 38928341.0 54.422 18.6 2.581 1.337 1803.987 NaN 597.029 9.59 NaN NaN 37.746 0.5
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
corona_df.describe()
total_cases new_cases total_deaths new_deaths total_cases_per_million new_cases_per_million total_deaths_per_million new_deaths_per_million total_tests new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed new_tests_smoothed_per_thousand stringency_index population population_density median_age aged_65_older aged_70_older gdp_per_capita extreme_poverty cvd_death_rate diabetes_prevalence female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
count 2.319400e+04 23194.000000 23194.000000 23194.000000 22901.000000 22901.000000 22901.000000 22901.000000 6.371000e+03 5.716000e+03 6371.000000 5716.000000 6896.000000 6896.000000 18857.000000 2.313000e+04 22216.00000 21019.000000 20728.000000 20912.000000 20796.000000 13896.000000 21041.000000 21714.000000 16995.000000 16804.000000 9332.000000 19320.000000
mean 2.455310e+04 633.229456 1567.292446 35.908425 646.193628 14.271963 27.594876 0.546684 3.384107e+05 1.066458e+04 16.008429 0.436121 10098.671259 0.404527 57.374451 1.038567e+08 382.61529 32.083791 9.717859 6.181671 22503.837537 10.554548 246.579234 8.013587 11.216356 32.642040 54.635245 3.203885
std 2.485441e+05 5578.944712 15896.244721 332.147376 1755.723341 63.294523 103.546689 3.341644 1.278046e+06 4.396397e+04 27.821141 0.672088 34980.210118 0.570997 33.003297 6.721039e+08 1748.55980 8.966658 6.443664 4.423402 21089.731603 17.943291 118.720714 4.062651 10.556220 13.254162 31.120507 2.592908
min 0.000000e+00 -2461.000000 0.000000 -1918.000000 0.000000 -265.189000 0.000000 -41.023000 1.000000e+00 1.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000 8.090000e+02 0.13700 15.100000 1.144000 0.526000 661.240000 0.100000 79.370000 0.990000 0.100000 7.700000 1.188000 0.100000
25% 1.000000e+01 0.000000 0.000000 0.000000 1.615000 0.000000 0.000000 0.000000 1.307600e+04 6.650000e+02 0.611500 0.040000 765.750000 0.039000 26.390000 2.083380e+06 41.28500 25.200000 3.853000 2.279000 6397.360000 0.500000 151.689000 5.310000 1.900000 21.400000 24.097000 1.400000
50% 1.430000e+02 3.000000 3.000000 0.000000 53.285000 0.366000 0.421000 0.000000 6.180100e+04 2.199500e+03 3.747000 0.176000 2463.000000 0.177000 70.830000 9.449321e+06 93.10500 32.100000 7.304000 4.674500 15524.995000 1.600000 234.499000 7.110000 6.900000 31.400000 59.607000 2.600000
75% 1.817000e+03 56.000000 42.000000 1.000000 362.363000 6.960000 7.496000 0.081000 2.280040e+05 7.112500e+03 19.427000 0.591000 6460.000000 0.576000 84.260000 3.346920e+07 222.87300 40.300000 15.070000 9.788000 35044.670000 12.000000 317.840000 10.080000 19.800000 40.800000 83.841000 4.210000
max 7.343562e+06 133510.000000 416430.000000 10520.000000 25544.418000 4944.376000 1237.551000 200.040000 2.241888e+07 1.318362e+06 236.859000 11.229000 532075.000000 4.993000 100.000000 7.794799e+09 19347.50000 48.200000 27.049000 18.493000 116935.600000 77.600000 724.417000 23.360000 44.000000 78.100000 98.999000 13.800000
# Print column dtypes and non-null counts to see where values are missing.
corona_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23194 entries, 0 to 23193
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   iso_code                         23130 non-null  object 
 1   continent                        22966 non-null  object 
 2   location                         23194 non-null  object 
 3   date                             23194 non-null  object 
 4   total_cases                      23194 non-null  int64  
 5   new_cases                        23194 non-null  int64  
 6   total_deaths                     23194 non-null  int64  
 7   new_deaths                       23194 non-null  int64  
 8   total_cases_per_million          22901 non-null  float64
 9   new_cases_per_million            22901 non-null  float64
 10  total_deaths_per_million         22901 non-null  float64
 11  new_deaths_per_million           22901 non-null  float64
 12  total_tests                      6371 non-null   float64
 13  new_tests                        5716 non-null   float64
 14  total_tests_per_thousand         6371 non-null   float64
 15  new_tests_per_thousand           5716 non-null   float64
 16  new_tests_smoothed               6896 non-null   float64
 17  new_tests_smoothed_per_thousand  6896 non-null   float64
 18  tests_units                      7500 non-null   object 
 19  stringency_index                 18857 non-null  float64
 20  population                       23130 non-null  float64
 21  population_density               22216 non-null  float64
 22  median_age                       21019 non-null  float64
 23  aged_65_older                    20728 non-null  float64
 24  aged_70_older                    20912 non-null  float64
 25  gdp_per_capita                   20796 non-null  float64
 26  extreme_poverty                  13896 non-null  float64
 27  cvd_death_rate                   21041 non-null  float64
 28  diabetes_prevalence              21714 non-null  float64
 29  female_smokers                   16995 non-null  float64
 30  male_smokers                     16804 non-null  float64
 31  handwashing_facilities           9332 non-null   float64
 32  hospital_beds_per_thousand       19320 non-null  float64
dtypes: float64(24), int64(4), object(5)
memory usage: 5.8+ MB
# Keep only rows that are complete in every column.
# NOTE(review): this is a very aggressive filter - the frame shrinks from
# 23194 rows to 1335 rows because sparsely populated columns (tests,
# handwashing_facilities, ...) knock out most records. Consider restricting
# the check to the columns actually used downstream.
corona_df = corona_df.dropna(axis='index', how='any')
# Confirm that no missing values remain (every per-column count should be 0).
corona_df.isna().sum()
iso_code                           0
continent                          0
location                           0
date                               0
total_cases                        0
new_cases                          0
total_deaths                       0
new_deaths                         0
total_cases_per_million            0
new_cases_per_million              0
total_deaths_per_million           0
new_deaths_per_million             0
total_tests                        0
new_tests                          0
total_tests_per_thousand           0
new_tests_per_thousand             0
new_tests_smoothed                 0
new_tests_smoothed_per_thousand    0
tests_units                        0
stringency_index                   0
population                         0
population_density                 0
median_age                         0
aged_65_older                      0
aged_70_older                      0
gdp_per_capita                     0
extreme_poverty                    0
cvd_death_rate                     0
diabetes_prevalence                0
female_smokers                     0
male_smokers                       0
handwashing_facilities             0
hospital_beds_per_thousand         0
dtype: int64
# Order the records chronologically (ISO 'YYYY-MM-DD' strings sort correctly
# as text) and plot daily new cases against the date.
corona_df = corona_df.sort_values('date')
plt.figure(figsize=(10, 10))
# 'date' is stored as strings (dtype object); parse it for the plot call so
# matplotlib draws a real time axis instead of one categorical tick per
# distinct string. The frame itself keeps its string dates.
# NOTE(review): the frame still holds several locations per date, so this
# single line jumps between locations within each day.
plt.plot(pd.to_datetime(corona_df['date']), corona_df['new_cases'])
[<matplotlib.lines.Line2D at 0x7f454ad9f668>]
# Distribution of the daily new-case counts across all remaining rows.
# NOTE(review): distplot is deprecated in newer seaborn releases; consider
# histplot/displot when the environment is upgraded.
plt.figure(figsize=(10, 6))
sns.distplot(a=corona_df['new_cases'], color='blue')
<matplotlib.axes._subplots.AxesSubplot at 0x7f454ac07e80>
# Compare the distribution of daily new cases between continents.
sns.violinplot(data=corona_df, x='continent', y='new_cases')
<matplotlib.axes._subplots.AxesSubplot at 0x7f454ab01668>
# Number of records available per location after the missing-value filter.
# A small font and a wide canvas keep the rotated location labels legible.
sns.set(font_scale=0.7)
plt.figure(figsize=(25, 12))
sns.countplot(data=corona_df, x='location')
plt.xticks(rotation=45)
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]), <a list of 20 Text major ticklabel objects>)
# Number of records available per date (one bar per distinct date string).
sns.set(font_scale=1.5)
plt.figure(figsize=(25, 12))
sns.countplot(data=corona_df, x='date')
plt.xticks(rotation=45)
(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147]),
 <a list of 148 Text major ticklabel objects>)
 # Plot daily new cases per Asian location, coloring each point by its date.
# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally. The variable name is kept as-is for any later references.
conventional = sns.catplot(x='new_cases', y='location',
                           data=corona_df[corona_df['continent'] == 'Asia'],
                           hue='date',
                           height=20)
# Keep only the two columns needed for forecasting: the date and the daily
# new-case count.
corona_prophet_df = corona_df.loc[:, ['date', 'new_cases']]
corona_prophet_df
date new_cases
13670 2020-01-08 0
13671 2020-01-09 0
13672 2020-01-10 0
13673 2020-01-11 0
13674 2020-01-12 0
... ... ...
9940 2020-06-10 1043
13824 2020-06-10 4199
21510 2020-06-10 11
7002 2020-06-11 170
11460 2020-06-11 105

1335 rows × 2 columns

# Prophet expects the timestamp column to be called 'ds' and the value to
# forecast to be called 'y'.
corona_prophet_df = corona_prophet_df.rename(columns=dict(date='ds', new_cases='y'))
corona_prophet_df
ds y
13670 2020-01-08 0
13671 2020-01-09 0
13672 2020-01-10 0
13673 2020-01-11 0
13674 2020-01-12 0
... ... ...
9940 2020-06-10 1043
13824 2020-06-10 4199
21510 2020-06-10 11
7002 2020-06-11 170
11460 2020-06-11 105

1335 rows × 2 columns

# Fit a Prophet model with default settings on the ds/y frame.
# NOTE(review): the frame contains several rows per date (one per location),
# so the model is fitted on per-location counts rather than on a single
# aggregated daily total - confirm this is intended.
m = Prophet()
m.fit(df=corona_prophet_df)
INFO:numexpr.utils:NumExpr defaulting to 2 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
<fbprophet.forecaster.Prophet at 0x7f454a6b7ba8>
# Forecasting into the future: extend the history by 365 periods and let the
# fitted model predict over both the historical and the new dates.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(df=future)
forecast
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper weekly weekly_lower weekly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2020-01-08 -164.691320 -1606.623289 1308.194285 -164.691320 -164.691320 -46.944830 -46.944830 -46.944830 -46.944830 -46.944830 -46.944830 0.0 0.0 0.0 -211.636150
1 2020-01-09 -161.768018 -1473.141429 1304.294373 -161.768018 -161.768018 -12.821984 -12.821984 -12.821984 -12.821984 -12.821984 -12.821984 0.0 0.0 0.0 -174.590002
2 2020-01-10 -158.844717 -1615.193644 1199.349348 -158.844717 -158.844717 44.432023 44.432023 44.432023 44.432023 44.432023 44.432023 0.0 0.0 0.0 -114.412694
3 2020-01-11 -155.921415 -1609.482813 1318.206549 -155.921415 -155.921415 50.075492 50.075492 50.075492 50.075492 50.075492 50.075492 0.0 0.0 0.0 -105.845923
4 2020-01-12 -152.998113 -1658.436506 1372.910912 -152.998113 -152.998113 59.454269 59.454269 59.454269 59.454269 59.454269 59.454269 0.0 0.0 0.0 -93.543845
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1695 2021-06-07 11232.181340 8288.595338 14079.923067 8569.916410 13707.460293 -14.797910 -14.797910 -14.797910 -14.797910 -14.797910 -14.797910 0.0 0.0 0.0 11217.383429
1696 2021-06-08 11259.223897 8062.106870 14064.176661 8578.363795 13735.282315 -79.397059 -79.397059 -79.397059 -79.397059 -79.397059 -79.397059 0.0 0.0 0.0 11179.826838
1697 2021-06-09 11286.266455 8079.740194 14235.994668 8596.767033 13761.036343 -46.944830 -46.944830 -46.944830 -46.944830 -46.944830 -46.944830 0.0 0.0 0.0 11239.321625
1698 2021-06-10 11313.309012 8248.612153 14344.310277 8605.909544 13786.790372 -12.821984 -12.821984 -12.821984 -12.821984 -12.821984 -12.821984 0.0 0.0 0.0 11300.487028
1699 2021-06-11 11340.351570 8240.869140 14286.654386 8619.533521 13817.855914 44.432023 44.432023 44.432023 44.432023 44.432023 44.432023 0.0 0.0 0.0 11384.783592

1700 rows × 16 columns

# Forecast with its uncertainty band, followed by the fitted components
# (trend and weekly seasonality).
figure = m.plot(fcst=forecast, xlabel='date', ylabel='cases')
figure2 = m.plot_components(fcst=forecast)

Region-specific prediction

# Reload the dataset, keep only fully populated rows, and restrict it to
# India's records in chronological order (no train/test split is made here).
df = pd.read_csv('/content/covid-data.csv')
df = df.dropna(how='any', axis=0)
df_sample = df[df['location'] == 'India']
df_sample = df_sample.sort_values('date')
# Parse the date strings for the plot call so matplotlib uses a real time
# axis instead of categorical string ticks; df_sample keeps its string dates.
plt.plot(pd.to_datetime(df_sample['date']), df_sample['new_cases'])
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
[<matplotlib.lines.Line2D at 0x7f45455e3a58>]
# Daily new deaths for the selected location. Dates are parsed for the plot
# call so matplotlib uses a time axis rather than categorical string ticks.
plt.plot(pd.to_datetime(df_sample['date']), df_sample['new_deaths'])
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
[<matplotlib.lines.Line2D at 0x7f454550df60>]
# Rename to Prophet's expected column names ('ds' for the date, 'y' for the
# value to forecast - here the daily new cases) and fit a default model.
df_sample = df_sample.rename(columns=dict(date='ds', new_cases='y'))
m = Prophet()
m.fit(df=df_sample)
# Forecasting into the future: predict over the history plus 365 more periods.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(df=future)
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
# Forecast of daily new cases for the selected location, followed by the
# fitted model components.
figure = m.plot(fcst=forecast, xlabel='date', ylabel='cases')
figure3 = m.plot_components(fcst=forecast)
# Build a dedicated ds/y frame for the deaths model.
# df_sample already carries a 'y' column (the new-case counts renamed for the
# previous model), so renaming 'new_deaths' to 'y' in place would leave two
# columns called 'y'. Selecting just the date and 'new_deaths' columns first
# gives Prophet an unambiguous target and leaves df_sample untouched, so this
# cell can be re-run safely.
deaths_df = (
    df_sample.rename(columns={'date': 'ds'})  # no-op if already renamed
    [['ds', 'new_deaths']]
    .rename(columns={'new_deaths': 'y'})
)
m = Prophet()
m.fit(deaths_df)
# Forecasting into the future: predict over the history plus 365 more periods.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
# The forecast target is deaths, so label the y-axis accordingly.
figure = m.plot(forecast, xlabel='date', ylabel='deaths')
figure3 = m.plot_components(forecast)
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.