# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import random
import seaborn as sns
from fbprophet import Prophet

/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

# Load the raw dataset from the notebook's working directory into a DataFrame.
corona_df = pd.read_csv('/content/covid-data.csv')

# Preview the first rows to confirm the file parsed as expected.
corona_df.head()

	iso_code	continent	location	date	total_tests	new_tests	total_tests_per_thousand	new_tests_per_thousand	new_tests_smoothed	new_tests_smoothed_per_thousand	tests_units	stringency_index	population	population_density	median_age	aged_65_older	aged_70_older	gdp_per_capita	extreme_poverty	cvd_death_rate	diabetes_prevalence	female_smokers	male_smokers	handwashing_facilities	hospital_beds_per_thousand
0	AFG	Asia	Afghanistan	2019-12-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	38928341.0	54.422	18.6	2.581	1.337	1803.987	NaN	597.029	9.59	NaN	NaN	37.746	0.5
1	AFG	Asia	Afghanistan	2020-01-01	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	38928341.0	54.422	18.6	2.581	1.337	1803.987	NaN	597.029	9.59	NaN	NaN	37.746	0.5
2	AFG	Asia	Afghanistan	2020-01-02	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	38928341.0	54.422	18.6	2.581	1.337	1803.987	NaN	597.029	9.59	NaN	NaN	37.746	0.5
3	AFG	Asia	Afghanistan	2020-01-03	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	38928341.0	54.422	18.6	2.581	1.337	1803.987	NaN	597.029	9.59	NaN	NaN	37.746	0.5
4	AFG	Asia	Afghanistan	2020-01-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	38928341.0	54.422	18.6	2.581	1.337	1803.987	NaN	597.029	9.59	NaN	NaN	37.746	0.5

# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
corona_df.describe()

	total_cases	new_cases	total_deaths	new_deaths	total_cases_per_million	new_cases_per_million	total_deaths_per_million	new_deaths_per_million	total_tests	new_tests	total_tests_per_thousand	new_tests_per_thousand	new_tests_smoothed	new_tests_smoothed_per_thousand	stringency_index	population	population_density	median_age	aged_65_older	aged_70_older	gdp_per_capita	extreme_poverty	cvd_death_rate	diabetes_prevalence	female_smokers	male_smokers	handwashing_facilities	hospital_beds_per_thousand
count	2.319400e+04	23194.000000	23194.000000	23194.000000	22901.000000	22901.000000	22901.000000	22901.000000	6.371000e+03	5.716000e+03	6371.000000	5716.000000	6896.000000	6896.000000	18857.000000	2.313000e+04	22216.00000	21019.000000	20728.000000	20912.000000	20796.000000	13896.000000	21041.000000	21714.000000	16995.000000	16804.000000	9332.000000	19320.000000
mean	2.455310e+04	633.229456	1567.292446	35.908425	646.193628	14.271963	27.594876	0.546684	3.384107e+05	1.066458e+04	16.008429	0.436121	10098.671259	0.404527	57.374451	1.038567e+08	382.61529	32.083791	9.717859	6.181671	22503.837537	10.554548	246.579234	8.013587	11.216356	32.642040	54.635245	3.203885
std	2.485441e+05	5578.944712	15896.244721	332.147376	1755.723341	63.294523	103.546689	3.341644	1.278046e+06	4.396397e+04	27.821141	0.672088	34980.210118	0.570997	33.003297	6.721039e+08	1748.55980	8.966658	6.443664	4.423402	21089.731603	17.943291	118.720714	4.062651	10.556220	13.254162	31.120507	2.592908
min	0.000000e+00	-2461.000000	0.000000	-1918.000000	0.000000	-265.189000	0.000000	-41.023000	1.000000e+00	1.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000	8.090000e+02	0.13700	15.100000	1.144000	0.526000	661.240000	0.100000	79.370000	0.990000	0.100000	7.700000	1.188000	0.100000
25%	1.000000e+01	0.000000	0.000000	0.000000	1.615000	0.000000	0.000000	0.000000	1.307600e+04	6.650000e+02	0.611500	0.040000	765.750000	0.039000	26.390000	2.083380e+06	41.28500	25.200000	3.853000	2.279000	6397.360000	0.500000	151.689000	5.310000	1.900000	21.400000	24.097000	1.400000
50%	1.430000e+02	3.000000	3.000000	0.000000	53.285000	0.366000	0.421000	0.000000	6.180100e+04	2.199500e+03	3.747000	0.176000	2463.000000	0.177000	70.830000	9.449321e+06	93.10500	32.100000	7.304000	4.674500	15524.995000	1.600000	234.499000	7.110000	6.900000	31.400000	59.607000	2.600000
75%	1.817000e+03	56.000000	42.000000	1.000000	362.363000	6.960000	7.496000	0.081000	2.280040e+05	7.112500e+03	19.427000	0.591000	6460.000000	0.576000	84.260000	3.346920e+07	222.87300	40.300000	15.070000	9.788000	35044.670000	12.000000	317.840000	10.080000	19.800000	40.800000	83.841000	4.210000
max	7.343562e+06	133510.000000	416430.000000	10520.000000	25544.418000	4944.376000	1237.551000	200.040000	2.241888e+07	1.318362e+06	236.859000	11.229000	532075.000000	4.993000	100.000000	7.794799e+09	19347.50000	48.200000	27.049000	18.493000	116935.600000	77.600000	724.417000	23.360000	44.000000	78.100000	98.999000	13.800000

# Column dtypes and non-null counts; shows which columns are sparsely populated.
corona_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23194 entries, 0 to 23193
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   iso_code                         23130 non-null  object 
 1   continent                        22966 non-null  object 
 2   location                         23194 non-null  object 
 3   date                             23194 non-null  object 
 4   total_cases                      23194 non-null  int64  
 5   new_cases                        23194 non-null  int64  
 6   total_deaths                     23194 non-null  int64  
 7   new_deaths                       23194 non-null  int64  
 8   total_cases_per_million          22901 non-null  float64
 9   new_cases_per_million            22901 non-null  float64
 10  total_deaths_per_million         22901 non-null  float64
 11  new_deaths_per_million           22901 non-null  float64
 12  total_tests                      6371 non-null   float64
 13  new_tests                        5716 non-null   float64
 14  total_tests_per_thousand         6371 non-null   float64
 15  new_tests_per_thousand           5716 non-null   float64
 16  new_tests_smoothed               6896 non-null   float64
 17  new_tests_smoothed_per_thousand  6896 non-null   float64
 18  tests_units                      7500 non-null   object 
 19  stringency_index                 18857 non-null  float64
 20  population                       23130 non-null  float64
 21  population_density               22216 non-null  float64
 22  median_age                       21019 non-null  float64
 23  aged_65_older                    20728 non-null  float64
 24  aged_70_older                    20912 non-null  float64
 25  gdp_per_capita                   20796 non-null  float64
 26  extreme_poverty                  13896 non-null  float64
 27  cvd_death_rate                   21041 non-null  float64
 28  diabetes_prevalence              21714 non-null  float64
 29  female_smokers                   16995 non-null  float64
 30  male_smokers                     16804 non-null  float64
 31  handwashing_facilities           9332 non-null   float64
 32  hospital_beds_per_thousand       19320 non-null  float64
dtypes: float64(24), int64(4), object(5)
memory usage: 5.8+ MB

# Keep only the rows in which every column is populated (dropna's defaults:
# row-wise, drop on any missing value).
# NOTE(review): this filters on all columns, including sparsely filled ones,
# so most rows are discarded -- confirm that is intended.
corona_df = corona_df.dropna()

# Sanity check: every per-column null count should now be zero.
corona_df.isna().sum()

iso_code                           0
continent                          0
location                           0
date                               0
total_cases                        0
new_cases                          0
total_deaths                       0
new_deaths                         0
total_cases_per_million            0
new_cases_per_million              0
total_deaths_per_million           0
new_deaths_per_million             0
total_tests                        0
new_tests                          0
total_tests_per_thousand           0
new_tests_per_thousand             0
new_tests_smoothed                 0
new_tests_smoothed_per_thousand    0
tests_units                        0
stringency_index                   0
population                         0
population_density                 0
median_age                         0
aged_65_older                      0
aged_70_older                      0
gdp_per_capita                     0
extreme_poverty                    0
cvd_death_rate                     0
diabetes_prevalence                0
female_smokers                     0
male_smokers                       0
handwashing_facilities             0
hospital_beds_per_thousand         0
dtype: int64

# Sort chronologically; the 'YYYY-MM-DD' strings order the same way as dates.
corona_df = corona_df.sort_values('date')

plt.figure(figsize=(10, 10))
# 'date' is stored as strings (object dtype). Passing strings makes matplotlib
# use a categorical axis, so parse them to get a real, evenly scaled time axis.
plt.plot(pd.to_datetime(corona_df['date']), corona_df['new_cases'])

[<matplotlib.lines.Line2D at 0x7f454ad9f668>]

# Distribution of the daily new-case counts across all remaining rows.
plt.figure(figsize=(10, 6))
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot -- switch once the installed seaborn version is confirmed.
sns.distplot(corona_df['new_cases'], color='blue')

<matplotlib.axes._subplots.AxesSubplot at 0x7f454ac07e80>

# Spread of daily new cases within each continent.
sns.violinplot(data=corona_df, x='continent', y='new_cases')

<matplotlib.axes._subplots.AxesSubplot at 0x7f454ab01668>

# Number of rows available per location; small font and rotated labels keep
# the location names legible on the wide figure.
sns.set(font_scale=0.7)
plt.figure(figsize=(25, 12))
sns.countplot(data=corona_df, x='location')
plt.xticks(rotation=45)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]), <a list of 20 Text major ticklabel objects>)

# Number of rows available per date, with rotated tick labels.
sns.set(font_scale=1.5)
plt.figure(figsize=(25, 12))
sns.countplot(data=corona_df, x='date')
plt.xticks(rotation=45)

(array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147]),
 <a list of 148 Text major ticklabel objects>)

# plot new cases per location for the Asian rows, coloured by date
# x and y are passed by keyword: newer seaborn releases accept only `data`
# positionally, so the positional form fails there.
conventional = sns.catplot(x='new_cases', y='location',
                           data=corona_df[corona_df['continent'] == 'Asia'],
                           hue='date',
                           height=20)

# Keep just the timestamp and target columns for the forecasting model.
corona_prophet_df = corona_df.loc[:, ['date', 'new_cases']]
corona_prophet_df

	date	new_cases
13670	2020-01-08	0
13671	2020-01-09	0
13672	2020-01-10	0
13673	2020-01-11	0
13674	2020-01-12	0
...	...	...
9940	2020-06-10	1043
13824	2020-06-10	4199
21510	2020-06-10	11
7002	2020-06-11	170
11460	2020-06-11	105

1335 rows × 2 columns

# Rename to the 'ds' (timestamp) / 'y' (target) column names the Prophet
# model is fitted on.
corona_prophet_df = corona_prophet_df.rename(
    {'date': 'ds', 'new_cases': 'y'}, axis='columns'
)

corona_prophet_df

	ds	y
13670	2020-01-08	0
13671	2020-01-09	0
13672	2020-01-10	0
13673	2020-01-11	0
13674	2020-01-12	0
...	...	...
9940	2020-06-10	1043
13824	2020-06-10	4199
21510	2020-06-10	11
7002	2020-06-11	170
11460	2020-06-11	105

1335 rows × 2 columns

# Fit a Prophet model with default settings on the ds/y frame.
# NOTE(review): corona_prophet_df holds several rows per date (one per
# location), so the model is fitted on pooled per-location values rather
# than a single daily total -- confirm that is intended, or aggregate by
# date first.
m = Prophet()
m.fit(corona_prophet_df)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.

<fbprophet.forecaster.Prophet at 0x7f454a6b7ba8>

# Forecasting into the future: extend the fitted history by 365 daily steps,
# then predict over the extended date range.
future = m.make_future_dataframe(periods=365, freq='D')
forecast = m.predict(future)

forecast

	ds	trend	yhat_lower	yhat_upper	trend_lower	trend_upper	additive_terms	additive_terms_lower	additive_terms_upper	weekly	weekly_lower	weekly_upper	multiplicative_terms	multiplicative_terms_lower	multiplicative_terms_upper	yhat
0	2020-01-08	-164.691320	-1606.623289	1308.194285	-164.691320	-164.691320	-46.944830	-46.944830	-46.944830	-46.944830	-46.944830	-46.944830	0.0	0.0	0.0	-211.636150
1	2020-01-09	-161.768018	-1473.141429	1304.294373	-161.768018	-161.768018	-12.821984	-12.821984	-12.821984	-12.821984	-12.821984	-12.821984	0.0	0.0	0.0	-174.590002
2	2020-01-10	-158.844717	-1615.193644	1199.349348	-158.844717	-158.844717	44.432023	44.432023	44.432023	44.432023	44.432023	44.432023	0.0	0.0	0.0	-114.412694
3	2020-01-11	-155.921415	-1609.482813	1318.206549	-155.921415	-155.921415	50.075492	50.075492	50.075492	50.075492	50.075492	50.075492	0.0	0.0	0.0	-105.845923
4	2020-01-12	-152.998113	-1658.436506	1372.910912	-152.998113	-152.998113	59.454269	59.454269	59.454269	59.454269	59.454269	59.454269	0.0	0.0	0.0	-93.543845
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1695	2021-06-07	11232.181340	8288.595338	14079.923067	8569.916410	13707.460293	-14.797910	-14.797910	-14.797910	-14.797910	-14.797910	-14.797910	0.0	0.0	0.0	11217.383429
1696	2021-06-08	11259.223897	8062.106870	14064.176661	8578.363795	13735.282315	-79.397059	-79.397059	-79.397059	-79.397059	-79.397059	-79.397059	0.0	0.0	0.0	11179.826838
1697	2021-06-09	11286.266455	8079.740194	14235.994668	8596.767033	13761.036343	-46.944830	-46.944830	-46.944830	-46.944830	-46.944830	-46.944830	0.0	0.0	0.0	11239.321625
1698	2021-06-10	11313.309012	8248.612153	14344.310277	8605.909544	13786.790372	-12.821984	-12.821984	-12.821984	-12.821984	-12.821984	-12.821984	0.0	0.0	0.0	11300.487028
1699	2021-06-11	11340.351570	8240.869140	14286.654386	8619.533521	13817.855914	44.432023	44.432023	44.432023	44.432023	44.432023	44.432023	0.0	0.0	0.0	11384.783592

1700 rows × 16 columns

# Forecast plot with labelled axes.
figure = m.plot(forecast, xlabel = 'date', ylabel = 'cases')

# Breakdown of the forecast into its fitted components.
figure2 = m.plot_components(forecast)

Region-specific prediction

# Build the single-country frame used for the region-specific forecast.
df = pd.read_csv('/content/covid-data.csv')

# Keep only rows with every column populated, then restrict to India.
df = df.dropna(how='any',axis=0)
df_sample = df[df['location']=='India']

# 'date' is read as strings. Parse it once so sorting is chronological by
# type and matplotlib draws a real time axis instead of falling back to
# categorical units. assign() returns a new frame, so the filtered slice
# is not mutated in place.
df_sample = df_sample.assign(date=pd.to_datetime(df_sample['date']))
df_sample = df_sample.sort_values('date')

# Daily new cases for India over time.
plt.plot(df_sample['date'], df_sample['new_cases'])

INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.

[<matplotlib.lines.Line2D at 0x7f45455e3a58>]

# Daily new deaths over time. Dates are parsed before plotting so matplotlib
# uses a real time axis rather than treating date strings as categories;
# to_datetime leaves already-parsed values unchanged.
plt.plot(pd.to_datetime(df_sample['date']), df_sample['new_deaths'])

INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.

[<matplotlib.lines.Line2D at 0x7f454550df60>]

# Rename to the 'ds' (timestamp) / 'y' (target) column names the Prophet
# model is fitted on; the target here is the daily new-case count.
df_sample = df_sample.rename({'date': 'ds', 'new_cases': 'y'}, axis='columns')

# Fresh default model fitted on the single-location sample.
m = Prophet()
m.fit(df_sample)
# Forecasting into the future: 365 daily steps beyond the fitted history.
future = m.make_future_dataframe(periods=365, freq='D')
forecast = m.predict(future)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.

# Forecast plot for the single-location new-cases model.
figure = m.plot(forecast, xlabel='date', ylabel='cases')

# Breakdown of that forecast into its fitted components.
figure3 = m.plot_components(forecast)

# Switch the forecasting target from new cases to new deaths.
# df_sample already carries a 'y' column from the earlier new_cases rename.
# Renaming new_deaths to 'y' on top of it would leave two columns with the
# same label and an ambiguous target, so drop the old one first.
# errors='ignore' keeps this safe if no 'y' column exists yet.
df_sample = (
    df_sample
    .drop(columns=['y'], errors='ignore')
    .rename(columns={'date': 'ds', 'new_deaths': 'y'})
)
m = Prophet()
m.fit(df_sample)
# Forecasting into the future: 365 daily steps beyond the fitted history.
future = m.make_future_dataframe(periods=365)
forecast = m.predict(future)
# The target is new_deaths, so label the y-axis accordingly.
figure = m.plot(forecast, xlabel='date', ylabel='deaths')
figure3 = m.plot_components(forecast)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.