Many Regressions

Publish date: Nov 14, 2021
Tags: Stats
library(broom)
library(forcats)
yearMin <- 1952
many_models <- gapminder %>% 
  # create a dataframe containing a separate dataframe for each country
  group_by(country,continent) %>% 
  
  # nesting creates a list-column of data frames; 
  # we will use the list to fit a regression model for every country
  nest()  %>% 
  
  # Run a simple regression model for every country in the dataframe
  mutate(simple_model = data %>% 
           map(~lm(lifeExp ~ I(year - yearMin), data = .))) %>% 
  
  # extract coefficients and model details with broom::tidy
  mutate(coefs = simple_model %>% map(~ tidy(., conf.int = TRUE)),
         details = simple_model %>% map(glance)) %>% 
  ungroup()

head(many_models, 10)
## # A tibble: 10 × 6
##    country     continent data              simple_model coefs    details 
##    <fct>       <fct>     <list>            <list>       <list>   <list>  
##  1 Afghanistan Asia      <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  2 Albania     Europe    <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  3 Algeria     Africa    <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  4 Angola      Africa    <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  5 Argentina   Americas  <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  6 Australia   Oceania   <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  7 Austria     Europe    <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  8 Bahrain     Asia      <tibble [12 × 4]> <lm>         <tibble> <tibble>
##  9 Bangladesh  Asia      <tibble [12 × 4]> <lm>         <tibble> <tibble>
## 10 Belgium     Europe    <tibble [12 × 4]> <lm>         <tibble> <tibble>

The resulting dataframe many_models contains for every country the country data, the regression model simple_model, the model coefficients coefs, as well as all the regression details. As discussed earlier, the intercept is the predicted life expectancy in 1952 and the slope is the average yearly improvement in life expectancy.

We will take our many_models dataframe, unnest the coefs and extract the interecept and slope for each country.

intercepts <- 
  many_models %>% 
  # Unnesting flattens a list-column of data frames back into regular columns
  unnest(coefs) %>% 
  filter(term == "(Intercept)") %>% 
  arrange(estimate) %>% 
  mutate(country = fct_inorder(country)) %>% 
  select(country, continent, estimate, std.error, conf.low, conf.high)

# let us look at the first 20 intercepts, or life expectancy in 1952
head(intercepts,20)
## # A tibble: 20 × 6
##    country           continent estimate std.error conf.low conf.high
##    <fct>             <fct>        <dbl>     <dbl>    <dbl>     <dbl>
##  1 Gambia            Africa        28.4     0.623     27.0      29.8
##  2 Afghanistan       Asia          29.9     0.664     28.4      31.4
##  3 Yemen, Rep.       Asia          30.1     0.861     28.2      32.0
##  4 Sierra Leone      Africa        30.9     0.448     29.9      31.9
##  5 Guinea            Africa        31.6     0.649     30.1      33.0
##  6 Guinea-Bissau     Africa        31.7     0.349     31.0      32.5
##  7 Angola            Africa        32.1     0.764     30.4      33.8
##  8 Mali              Africa        33.1     0.262     32.5      33.6
##  9 Mozambique        Africa        34.2     1.24      31.4      37.0
## 10 Equatorial Guinea Africa        34.4     0.178     34.0      34.8
## 11 Nepal             Asia          34.4     0.502     33.3      35.5
## 12 Somalia           Africa        34.7     1.01      32.4      36.9
## 13 Burkina Faso      Africa        34.7     1.11      32.2      37.2
## 14 Niger             Africa        35.2     1.19      32.5      37.8
## 15 Eritrea           Africa        35.7     0.813     33.9      37.5
## 16 Ethiopia          Africa        36.0     0.569     34.8      37.3
## 17 Bangladesh        Asia          36.1     0.530     35.0      37.3
## 18 Djibouti          Africa        36.3     0.612     34.9      37.6
## 19 Madagascar        Africa        36.7     0.304     36.0      37.3
## 20 Senegal           Africa        36.7     0.506     35.6      37.9
slopes <- many_models %>% 
  unnest(coefs) %>% 
  filter(term == "I(year - yearMin)") %>% 
  arrange(estimate) %>% 
  mutate(country = fct_inorder(country)) %>% 
  select(country, continent, estimate, std.error, conf.low, conf.high)

# let us look at the first 20 slopes, or average improvement in life exepctancy per year
head(slopes,20)
## # A tibble: 20 × 6
##    country          continent estimate std.error conf.low conf.high
##    <fct>            <fct>        <dbl>     <dbl>    <dbl>     <dbl>
##  1 Zimbabwe         Africa     -0.0930   0.121   -0.362       0.175
##  2 Zambia           Africa     -0.0604   0.0757  -0.229       0.108
##  3 Rwanda           Africa     -0.0458   0.110   -0.290       0.199
##  4 Botswana         Africa      0.0607   0.102   -0.167       0.288
##  5 Congo, Dem. Rep. Africa      0.0939   0.0406   0.00338     0.184
##  6 Swaziland        Africa      0.0951   0.111   -0.153       0.343
##  7 Lesotho          Africa      0.0956   0.0992  -0.126       0.317
##  8 Liberia          Africa      0.0960   0.0297   0.0299      0.162
##  9 Denmark          Europe      0.121    0.00667  0.106       0.136
## 10 Uganda           Africa      0.122    0.0533   0.00280     0.240
## 11 Hungary          Europe      0.124    0.0199   0.0794      0.168
## 12 Cote d'Ivoire    Africa      0.131    0.0657  -0.0157      0.277
## 13 Norway           Europe      0.132    0.00819  0.114       0.150
## 14 Slovak Republic  Europe      0.134    0.0217   0.0856      0.182
## 15 Netherlands      Europe      0.137    0.00582  0.124       0.150
## 16 Czech Republic   Europe      0.145    0.0138   0.114       0.176
## 17 Bulgaria         Europe      0.146    0.0420   0.0522      0.239
## 18 Burundi          Africa      0.154    0.0269   0.0941      0.214
## 19 Paraguay         Americas    0.157    0.00655  0.143       0.172
## 20 Romania          Europe      0.157    0.0245   0.103       0.212

We can plot the estimates for the intercept and the slope for all countries, faceted by continent.

  ggplot(data = intercepts, 
         aes(x = country, y = estimate, fill=continent))+
  geom_col()+
  coord_flip()+
  theme_minimal(6)+
  facet_wrap(~continent, scales="free")+
  labs(title = 'Life expectancy in 1952',
       caption = 'Source: Gapminder package',
       x = "") +
  theme(legend.position="none")

  ggplot(data = slopes, 
         aes(x = country, y = estimate, fill = continent))+
  geom_col()+
  coord_flip()+
  theme_minimal(6)+
  facet_wrap(~continent, scales="free")+
  labs(title = 'Average yearly improvement in life expectancy, 1952-2007',
       caption = 'Source: Gapminder package', 
       x = "") +
  theme(legend.position="none")

Finally, as in every regression model, the slopes are estimates for the yearly improvement in life expectancy. If we wanted to, we can plot the confidence interval for each country’s improvement.

ggplot(data = slopes, 
       aes(x = country, y = estimate, colour = continent))+
  geom_pointrange(aes(ymin = conf.low, ymax = conf.high))+
  coord_flip()+
  theme_minimal(6)+
  facet_wrap(~continent, scales="free")+
  labs(title = 'Average yearly improvement in life expectancy, 1952-2007',
       caption = 'Source: Gapminder package', 
       x = "") +
  theme(legend.position="none")