Sampling distributions and the Hartford corrections data

Loading data

We got this data from http://data.hartford.gov/

# Read the daily facility population counts, give the columns short
# snake-case-friendly names, and convert Date with mdy().
correction <-
  read_csv("Correctional_Facility_Daily_Population_Count_By_Facility.csv") %>%
  rename(
    name      = `Facility Name`,
    accused   = `Accused/Other Status Count`,
    sentenced = `Sentenced Status Count`,
    total     = `Total Facility Population Count`
  ) %>%
  mutate(Date = mdy(Date))

Which facilities hold the most accused people, on average?

# Mean accused count per facility, largest first.
df_stats(accused ~ name, data = correction, mean) %>%
  arrange(desc(mean_accused))

##                   name mean_accused
## 1          HARTFORD CC 760.17326233
## 2        BRIDGEPORT CC 603.92241300
## 3         NEW HAVEN CC 576.88908273
## 4          CORRIGAN CI 398.67181333
## 5              YORK CI 349.01013125
## 6            WALKER RC 198.43179411
## 7            OSBORN CI 159.13378019
## 8              NIANTIC 143.98761714
## 9               GARNER 137.70524297
## 10           MANSON YI 121.23294463
## 11           MORGAN ST 109.32900081
## 12         NORTHERN CI  89.11250146
## 13          MACDOUGALL  83.53118399
## 14           UNION AVE  72.13440551
## 15      JENNINGS RD DC  71.65614237
## 16           RADGOWSKI  41.28059866
## 17         CHESHIRE CI  34.73041961
## 18                 145  27.50000000
## 19 WILLARD-CYBULSKI CI  26.43879626
## 20          LITCHFIELD  25.29713952
## 21         ROBINSON CI  22.95712056
## 22         BROOKLYN CI  18.61325865
## 23       NIANTIC ANNEX  13.29213483
## 24            GATES CI   9.84133511
## 25          ENFIELD CI   9.67642140
## 26          WEBSTER CI   3.65321546
## 27           BERGIN CI   3.62180669
## 28       EDDY/DWI UNIT   0.50785855
## 29    HARTELL/DWI UNIT   0.39505155
## 30            CYBULSKI   0.06140879
## 31          WILLARD CI   0.01356050
## 32         GREENSVILLE   0.00000000
## 33           HARTELL/F   0.00000000
## 34          MALONEY CI   0.00000000
## 35         W SUB ABUSE   0.00000000
## 36       WALLENS RIDGE   0.00000000

Focus on Hartford

# Keep only the rows for the Hartford facility.
hartford <- filter(correction, name == "HARTFORD CC")

Plotting

# Accused count over time at Hartford, drawn as individual points ...
gf_point(accused ~ Date, data = hartford)

# ... and as a connected line.
gf_line(accused ~ Date, data = hartford)

More complicated filtering

I wonder how Osborn and Enfield look. Let’s filter the data down to just those two facilities.

# Rows whose facility is either of the two we are curious about.
correction %>%
  filter(name %in% c("OSBORN CI", "ENFIELD CI"))

## # A tibble: 21,655 x 5
##    Date       name       accused sentenced total
##    <date>     <chr>        <int>     <int> <int>
##  1 1988-10-01 ENFIELD CI       0       447   447
##  2 1988-10-01 OSBORN CI       12      1407  1419
##  3 1988-10-02 ENFIELD CI       0       447   447
##  4 1988-10-02 OSBORN CI       12      1407  1419
##  5 1988-10-03 ENFIELD CI       0       447   447
##  6 1988-10-03 OSBORN CI       12      1407  1419
##  7 1988-10-04 ENFIELD CI       0       470   470
##  8 1988-10-04 OSBORN CI       12      1399  1411
##  9 1988-10-05 ENFIELD CI       0       470   470
## 10 1988-10-05 OSBORN CI       11      1400  1411
## # ... with 21,645 more rows

Sampling

(we copied this code from Danny’s resampling lesson so we didn’t have to start from scratch!)

# Repeat 2000 times: draw 50 rows from `correction` without replacement
# and record the mean of `total` for that sample (column `mean_total`).
Trials <- do(2000) * df_stats(
  ~ total,
  data = sample_n(correction, size = 50, replace = FALSE),
  mean
)

# Distribution of the 2000 sample means.
gf_histogram(~ mean_total, data = Trials)

What if we changed the sample size?

# Same resampling as before, repeated for three different sample sizes.
# Each result has 2000 rows and a single `mean_total` column.

# Samples of 5 rows.
Trials5 <- do(2000) * df_stats(
  ~ total,
  data = sample_n(correction, size = 5, replace = FALSE),
  mean
)

# Samples of 10 rows.
Trials10 <- do(2000) * df_stats(
  ~ total,
  data = sample_n(correction, size = 10, replace = FALSE),
  mean
)

# Samples of 100 rows.
Trials100 <- do(2000) * df_stats(
  ~ total,
  data = sample_n(correction, size = 100, replace = FALSE),
  mean
)

(Tidying the data)

library(tidyr)

# One column of sample means per sample size, side by side (wide format).
alltrials <- data.frame(
  ss5   = Trials5[["mean_total"]],
  ss10  = Trials10[["mean_total"]],
  ss100 = Trials100[["mean_total"]]
)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:Matrix':
## 
##     expand

# Stack the three columns into long format: `which` holds the source
# column name (ss5 / ss10 / ss100) and `mean` holds the sample mean.
alltrials <- gather(alltrials, key = "which", value = "mean")

Plotting the sampling distributions!

# Overlaid histograms of the sample means, coloured by sample size.
gf_histogram(~ mean, data = alltrials, fill = ~ which)

# The same distributions as side-by-side violins.
gf_violin(mean ~ which, data = alltrials)

# Centre and spread of the sample means for each sample size.
alltrials %>%
  df_stats(mean ~ which, mean, sd)

##   which mean_mean   sd_mean
## 1  ss10  786.0325 130.82385
## 2 ss100  787.1333  41.07875
## 3   ss5  795.0815 188.78064

What about the mean of the sample means (here, for the samples of size 50)?

# Average of the 2000 sample means stored in `Trials`.
df_stats(~ mean_total, data = Trials, mean)

##   mean_mean_total
## 1         787.365

What was our population mean?

# Mean of `total` over every row of the full data set.
df_stats(~ total, data = correction, mean)

##   mean_total
## 1   788.0564