COVID-19 Exploratory Data Analysis
Introduction
This is a personal R Markdown document that I created to visualize COVID-19 updates and to carry out some preliminary exploratory data analysis (EDA). The data come from the GitHub repository for Coronavirus COVID-19 Global Cases, created and maintained by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (JHU).
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(forecast))
suppressPackageStartupMessages(library(zoo))
suppressPackageStartupMessages(library(xts))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(gghighlight))
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(directlabels))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(plotly))
#suppressPackageStartupMessages(library(rjson))
Reading the data
# Download the three global time-series tables (confirmed cases, deaths and
# recoveries) straight from the CSSE repository on GitHub. Each call needs
# network access and returns one tibble.
COVID_confirmed_global_raw <- readr::read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
)
COVID_deaths_global_raw <- readr::read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
)
COVID_recovered_global_raw <- readr::read_csv(
  file = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
)
Reshaping and formatting data
# Reshape each wide table (one column per reporting date) to long format:
# the date header goes into `date` and the cell value into `n_cases`.
# The date columns run from `1/22/20` through the final column, so the range is
# selected with tidyselect's `last_col()` rather than by looking the last
# column name up by hand.
COVID_confirmed_global_longer <- COVID_confirmed_global_raw %>%
  pivot_longer(
    cols = `1/22/20`:last_col(),
    names_to = "date",
    values_to = "n_cases"
  )
COVID_deaths_global_longer <- COVID_deaths_global_raw %>%
  pivot_longer(
    cols = `1/22/20`:last_col(),
    names_to = "date",
    values_to = "n_cases"
  )
COVID_recovered_global_longer <- COVID_recovered_global_raw %>%
  pivot_longer(
    cols = `1/22/20`:last_col(),
    names_to = "date",
    values_to = "n_cases"
  )
# Give the long tables short, uniform column names. The assignment is
# positional: the six names replace the existing six columns in order.
names(COVID_confirmed_global_longer) <-
  c("state", "country", "lat", "long", "date", "n_cases")
names(COVID_deaths_global_longer) <-
  c("state", "country", "lat", "long", "date", "n_cases")
names(COVID_recovered_global_longer) <-
  c("state", "country", "lat", "long", "date", "n_cases")
# Collapse the state/province rows into one row per country and date by
# summing `n_cases`. Only `country`, `date` and `n_cases` survive the
# summarise, so `state`, `lat` and `long` are dropped without a separate
# `select()`.
# `.groups = "drop_last"` spells out the default: the result stays grouped by
# `country`, which the later lag-based `new_cases` calculation works within.
# Making it explicit also silences the "grouped output" message.
COVID_confirmed_global_longer <- COVID_confirmed_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")
COVID_deaths_global_longer <- COVID_deaths_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")
COVID_recovered_global_longer <- COVID_recovered_global_longer %>%
  group_by(country, date) %>%
  summarize(n_cases = sum(n_cases), .groups = "drop_last")
# Convert the `date` columns from character to Date.
# The source headers look like "1/22/20", i.e. month/day/two-digit year, so the
# year must be parsed with `%y`. With `%Y` the "20" is read as the literal year
# 20 and every date lands in the first century (0020-01-22).
COVID_confirmed_global_longer$date <-
  as.Date(COVID_confirmed_global_longer$date, format = "%m/%d/%y")
COVID_deaths_global_longer$date <-
  as.Date(COVID_deaths_global_longer$date, format = "%m/%d/%y")
COVID_recovered_global_longer$date <-
  as.Date(COVID_recovered_global_longer$date, format = "%m/%d/%y")
# Sort every table by country and date, then derive `new_cases` as the change
# in the cumulative `n_cases` from the previous row. The first row has no
# predecessor and is compared against 0 (the lag default). The lag is taken
# within the grouping the table carries (by `country`, per the summarise
# message earlier in the document).
COVID_confirmed_global_longer <- mutate(
  arrange(COVID_confirmed_global_longer, country, date),
  new_cases = n_cases - dplyr::lag(n_cases, default = 0)
)
COVID_deaths_global_longer <- mutate(
  arrange(COVID_deaths_global_longer, country, date),
  new_cases = n_cases - dplyr::lag(n_cases, default = 0)
)
COVID_recovered_global_longer <- mutate(
  arrange(COVID_recovered_global_longer, country, date),
  new_cases = n_cases - dplyr::lag(n_cases, default = 0)
)
Let’s look at the current data format
# Render the first rows of the confirmed-cases table as a markdown table.
COVID_confirmed_global_longer %>%
  head() %>%
  knitr::kable(format = "markdown")
country | date | n_cases | new_cases |
---|---|---|---|
Afghanistan | 20-01-22 | 0 | 0 |
Afghanistan | 20-01-23 | 0 | 0 |
Afghanistan | 20-01-24 | 0 | 0 |
Afghanistan | 20-01-25 | 0 | 0 |
Afghanistan | 20-01-26 | 0 | 0 |
Afghanistan | 20-01-27 | 0 | 0 |
Creating some functions for quick stats
# Print world-wide totals for confirmed cases, deaths and recoveries.
#
# Takes no arguments; reads the three global `COVID_*_global_longer` tables
# and prints one sentence per table with the total count and the total of
# the most recent `new_cases` values.
world_summary <- function() {
  # Reduce one long table to a single row of world totals: per country, take
  # the largest `n_cases` and the last `new_cases`, then sum over countries.
  # NOTE(review): `max(n_cases)` equals the latest value only if the series
  # never decreases -- confirm this is the intended "as of today" figure.
  # `dplyr::last()` depends on the rows already being in date order.
  world_totals <- function(long_tbl) {
    per_country <- long_tbl %>%
      group_by(country) %>%
      summarize(
        n_cases_today = max(n_cases),
        new_cases_today = dplyr::last(new_cases)
      )
    per_country %>%
      ungroup() %>%
      summarize(
        n_cases_total = sum(n_cases_today),
        new_cases_total = sum(new_cases_today)
      )
  }

  confirmed <- world_totals(COVID_confirmed_global_longer)
  deaths <- world_totals(COVID_deaths_global_longer)
  recovered <- world_totals(COVID_recovered_global_longer)

  print(paste0("number of total confirmed cases in the world as of today: ", confirmed$n_cases_total, " with ", confirmed$new_cases_total, " new cases"))
  print(paste0("number of total deaths in the world as of today: ", deaths$n_cases_total, " with ", deaths$new_cases_total, " new deaths"))
  print(paste0("number of total recovered cases in the world as of today: ", recovered$n_cases_total, " with ", recovered$new_cases_total, " new cases"))
}
# Print confirmed, death and recovered totals for a single country.
#
# country1: one country name, spelled as in the `country` column of the
#   global `COVID_*_global_longer` tables (e.g. "US", "Italy").
#
# Prints one sentence per table. Stops with an error if `country1` is not a
# single value or if a table has no rows for it; without that check the
# zero-row summary makes `paste0()` print a sentence with blank numbers.
country_summary <- function(country1) {
  if (length(country1) != 1L) {
    stop("`country1` must be a single country name.", call. = FALSE)
  }

  # Summarise one long table for `country1`: the largest `n_cases` and the
  # last `new_cases` value. `label` only names the table in the error message.
  # NOTE(review): `max(n_cases)` equals the latest value only if the series
  # never decreases -- confirm this is the intended "as of today" figure.
  country_stats <- function(long_tbl, label) {
    stats <- long_tbl %>%
      group_by(country) %>%
      dplyr::filter(country == country1) %>%
      summarize(
        n_cases_today = max(n_cases),
        new_cases_today = dplyr::last(new_cases)
      )
    if (nrow(stats) == 0L) {
      stop("no ", label, " data found for country '", country1, "'",
           call. = FALSE)
    }
    stats
  }

  df1 <- country_stats(COVID_confirmed_global_longer, "confirmed")
  df2 <- country_stats(COVID_deaths_global_longer, "deaths")
  df3 <- country_stats(COVID_recovered_global_longer, "recovered")

  print(paste0("number of confirmed cases in ", country1, " as of today: ", df1$n_cases_today, " with ", df1$new_cases_today, " new cases"))
  print(paste0("number of deaths in ", country1, " as of today: ", df2$n_cases_today, " with ", df2$new_cases_today, " new deaths"))
  print(paste0("number of recovered cases in ", country1, " as of today: ", df3$n_cases_today, " with ", df3$new_cases_today, " new cases"))
}
world_summary()
## [1] "number of total confirmed cases in the world as of today: 227649349 with 593099 new cases"
## [1] "number of total deaths in the world as of today: 4679139 with 8881 new deaths"
## [1] "number of total recovered cases in the world as of today: 137249983 with 0 new cases"
country_summary("US")
## [1] "number of confirmed cases in US as of today: 41993789 with 207886 new cases"
## [1] "number of deaths in US as of today: 672635 with 2635 new deaths"
## [1] "number of recovered cases in US as of today: 6298082 with 0 new cases"
country_summary("Italy")
## [1] "number of confirmed cases in Italy as of today: 4627699 with 4544 new cases"
## [1] "number of deaths in Italy as of today: 130233 with 66 new deaths"
## [1] "number of recovered cases in Italy as of today: 4144608 with 0 new cases"
country_summary("Spain")
## [1] "number of confirmed cases in Spain as of today: 4929546 with 3222 new cases"
## [1] "number of deaths in Spain as of today: 85783 with 44 new deaths"
## [1] "number of recovered cases in Spain as of today: 150376 with 0 new cases"
country_summary("China")
## [1] "number of confirmed cases in China as of today: 107838 with 49 new cases"
## [1] "number of deaths in China as of today: 4851 with 0 new deaths"
## [1] "number of recovered cases in China as of today: 99228 with 0 new cases"
country_summary("Egypt")
## [1] "number of confirmed cases in Egypt as of today: 295639 with 588 new cases"
## [1] "number of deaths in Egypt as of today: 16935 with 14 new deaths"
## [1] "number of recovered cases in Egypt as of today: 232179 with 0 new cases"
country_summary("Germany")
## [1] "number of confirmed cases in Germany as of today: 4137062 with 9904 new cases"
## [1] "number of deaths in Germany as of today: 92928 with 22 new deaths"
## [1] "number of recovered cases in Germany as of today: 3659260 with 0 new cases"
country_summary("France")
## [1] "number of confirmed cases in France as of today: 7029959 with 7756 new cases"
## [1] "number of deaths in France as of today: 116618 with 107 new deaths"
## [1] "number of recovered cases in France as of today: 415111 with 0 new cases"