9 Read-Write Files

9.1 Reading CSV file

CSV stands for comma separated values. This is NOT Excel file although most of us probably think it is!

Use the read_csv() function in the readr package (which is included in tidyverse). It will automatically detect column data type and read them as such.

The following data was retried from Federal Reserve Bank of New York on the Labor market for recent college graduates. The data is freely available to download, and was accessed on 10 March 2023.

library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.6     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()

# https://www.newyorkfed.org/research/college-labor-market/index#/outcomes-by-major
labormarket = read_csv("data/labor-market-recent-college-grads.csv")

Rows: 74 Columns: 6

── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Major
dbl (3): Unemployment Rate, Underemployment Rate, Share with Graduate Degree

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

labormarket %>% glimpse()

Rows: 74
Columns: 6
$ Major                        <chr> "Agriculture", "Animal and Plant Sciences…
$ `Unemployment Rate`          <dbl> 2.4, 4.4, 5.1, 2.1, 4.4, 5.8, 6.5, 8.4, 4…
$ `Underemployment Rate`       <dbl> 52.1, 52.5, 50.2, 29.1, 53.7, 52.7, 47.7,…
$ `Median Wage Early Career`   <dbl> 45000, 42000, 45000, 50000, 45000, 47000,…
$ `Median Wage Mid-Career`     <dbl> 70000, 67000, 68000, 85000, 66000, 75000,…
$ `Share with Graduate Degree` <dbl> 21.0, 34.7, 32.5, 38.7, 49.7, 24.2, 27.0,…

9.2 Writing CSV file

labormarket %>% 
  select(Major, `Unemployment Rate`) %>% 
  write_csv('data/tmp_output_test.csv')

9.3 Reading Excel file

# Need readxl library 
# install.packages('readxl')

library(readxl)

labor = read_xlsx("data/labor-market-for-recent-college-grads-original.xlsx")

New names:
• `` -> `...2`
• `` -> `...3`

What happened?

I read the first sheet on the Excel file. Please open the excel file and see for yourself.

First worksheet of the labor market data

With the read_xlsx() function, we can specify the exact worksheet we want to read from. For now, we want to read from the t1_majors sheet. Let modify the query and run it now.

library(readxl)

labor = read_xlsx("data/labor-market-for-recent-college-grads-original.xlsx", sheet = 't1_majors')

New names:
• `` -> `...2`
• `` -> `...3`
• `` -> `...4`
• `` -> `...5`
• `` -> `...6`

labor %>% glimpse()

Rows: 85
Columns: 6
$ `FEDERAL RESERVE BANK of NEW YORK` <chr> NA, NA, NA, NA, "Chart and Table Da…
$ ...2                               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Un…
$ ...3                               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Un…
$ ...4                               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Me…
$ ...5                               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Me…
$ ...6                               <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Sh…

But this is not exactly what we wanted! More work needed here.

# Default name_reair parameter
labor = read_xlsx("data/labor-market-for-recent-college-grads-original.xlsx", sheet = 't1_majors', range = 'a14:f88')
labor %>% glimpse()

Rows: 74
Columns: 6
$ Major                        <chr> "Agriculture", "Animal and Plant Sciences…
$ `Unemployment Rate`          <dbl> 2.352, 4.430, 5.136, 2.068, 4.390, 5.796,…
$ `Underemployment Rate`       <dbl> 52.093, 52.457, 50.227, 29.123, 53.671, 5…
$ `Median Wage Early Career`   <dbl> 45000, 42000, 45000, 50000, 45000, 47000,…
$ `Median Wage Mid-Career`     <dbl> 70000, 67000, 68000, 85000, 66000, 75000,…
$ `Share with Graduate Degree` <dbl> 20.958, 34.717, 32.461, 38.740, 49.747, 2…

Notice how the names of the columns are created. This is not very useful when we are coding! We want the names to be read without sapces in them. Fortunately, there is a way to do this by setting the .name_repair = 'universal'

labor = read_xlsx("data/labor-market-for-recent-college-grads-original.xlsx", sheet = 't1_majors', range = 'a14:f88',
                  .name_repair = "universal")

New names:
• `Unemployment Rate` -> `Unemployment.Rate`
• `Underemployment Rate` -> `Underemployment.Rate`
• `Median Wage Early Career` -> `Median.Wage.Early.Career`
• `Median Wage Mid-Career` -> `Median.Wage.Mid.Career`
• `Share with Graduate Degree` -> `Share.with.Graduate.Degree`

labor %>% glimpse()

Rows: 74
Columns: 6
$ Major                      <chr> "Agriculture", "Animal and Plant Sciences",…
$ Unemployment.Rate          <dbl> 2.352, 4.430, 5.136, 2.068, 4.390, 5.796, 6…
$ Underemployment.Rate       <dbl> 52.093, 52.457, 50.227, 29.123, 53.671, 52.…
$ Median.Wage.Early.Career   <dbl> 45000, 42000, 45000, 50000, 45000, 47000, 4…
$ Median.Wage.Mid.Career     <dbl> 70000, 67000, 68000, 85000, 66000, 75000, 7…
$ Share.with.Graduate.Degree <dbl> 20.958, 34.717, 32.461, 38.740, 49.747, 24.…

Renaming the columns

labor %>% 
  rename(
    unem_rate = Unemployment.Rate,
    underem_rate = Underemployment.Rate
  ) %>% 
  glimpse()

Rows: 74
Columns: 6
$ Major                      <chr> "Agriculture", "Animal and Plant Sciences",…
$ unem_rate                  <dbl> 2.352, 4.430, 5.136, 2.068, 4.390, 5.796, 6…
$ underem_rate               <dbl> 52.093, 52.457, 50.227, 29.123, 53.671, 52.…
$ Median.Wage.Early.Career   <dbl> 45000, 42000, 45000, 50000, 45000, 47000, 4…
$ Median.Wage.Mid.Career     <dbl> 70000, 67000, 68000, 85000, 66000, 75000, 7…
$ Share.with.Graduate.Degree <dbl> 20.958, 34.717, 32.461, 38.740, 49.747, 24.…

9.4 Reading other statistical data sets

The haven package allowes you to read data from other formats. Currently, the haven package reads data from SPSS, SAS and Stata formats.

For more and most up to date information about the package visit https://haven.tidyverse.org/

9.4.1 Reading SPSS file

The data have been downloaded from https://lo.unisa.edu.au/mod/book/view.php?id=646443&chapterid=106604

2016 Health and Society student health survey data (SPSS format, 36 KB) HLTH1025_2016.sav
2016 Health and Society student health survey data (Excel format, 93 KB): HLTH1025_2016.xlsx
SPSS Health and Society student health survey data_Year (SPSS format, 4 KB): HLTH1025_2016_yr.sav

library(haven)

df_spss_2016 = read_sav('data/spss/HLTH1025_2016.sav')

df_spss_2016 %>% glimpse()

Rows: 306
Columns: 59
$ IDnumber      <dbl> 20160186, 20160011, 20160081, 20160155, 20160182, 201600…
$ age           <dbl> 19, 17, 18, 18, 19, 17, 19, 17, 20, 20, 18, 22, 19, 19, …
$ sex           <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
$ workstat      <dbl> 3, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 3,…
$ increg        <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,…
$ incmnth       <dbl> 0, 0, 300, 0, 600, 0, 0, 0, 1500, 400, 0, 433, 1200, 0, …
$ incwk         <dbl> 0.00000, 0.00000, 69.23077, 0.00000, 138.46154, 0.00000,…
$ housing       <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 3, 3, 2, 2, 4, 5, 2, 2,…
$ living        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1,…
$ homepay       <dbl> 1, 1, 1, 1, 4, 4, 99, 1, 4, 1, 1, 4, 4, 4, 1, 4, 1, 4, 2…
$ homecost      <dbl> 0, 0, 0, 50, 1200, 600, 0, 0, 303, 0, 0, 179, 360, 1128,…
$ homecostwk    <dbl> 0.00000, 0.00000, 0.00000, 11.53846, 276.92308, 138.4615…
$ mobile        <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ mobilepay     <dbl> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0,…
$ mobilecost    <dbl> 30, 0, 0, 82, 0, 0, 50, 30, 40, 0, 0, 0, 29, 52, 60, 90,…
$ mobilecostwk  <dbl> 6.923077, 0.000000, 0.000000, 18.923077, 0.000000, 0.000…
$ transport     <dbl> 50, 0, 20, 60, 10, 14, 30, 20, 60, 8, 20, 5, 90, 3, 30, …
$ food          <dbl> 20, 0, 70, 50, 60, 15, 50, 20, 200, 40, 10, 20, 100, 50,…
$ entertain     <dbl> 20, 0, 0, 0, 20, 0, 0, 20, 0, 50, 0, 20, 50, 0, 50, 30, …
$ privhlth      <dbl> 1, 0, 1, 97, 0, 1, 97, 97, 1, 1, 0, 1, 0, 1, 97, 1, 1, 0…
$ fs_illness    <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,…
$ fs_accident   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ fs_death      <dbl> 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,…
$ fs_mtlillness <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ fs_disability <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ fs_divsep     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ fs_nogetjob   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,…
$ fs_lossofjob  <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ fs_alcdrug    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ fs_witviol    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ fs_absvcrim   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ fs_police     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ fs_gambling   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ famstress     <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,…
$ drivelic      <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,…
$ mvacc         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
$ mvaccinj      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smokeyn       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smokereg      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ smokestat     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ suffact       <dbl> 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 0, 0, 1,…
$ veg           <dbl> 4.0, 3.0, 1.0, 3.0, 3.0, 3.0, 2.0, 4.0, 0.0, 1.0, 2.0, 1…
$ fruit         <dbl> 2.0, 2.0, 3.0, 1.0, 0.5, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 0…
$ medication    <dbl> 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1,…
$ sf1           <dbl> 4, 3, 1, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 5, 3, 3, 3, 3,…
$ height        <dbl> 163.0, 164.0, 153.0, 150.0, 158.0, 150.0, 150.0, 158.0, …
$ weight        <dbl> 9999.0, 9999.0, 35.0, 40.0, 40.0, 41.0, 41.0, 42.0, 43.0…
$ asthma        <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ cancer        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ cvcondition   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ arthritis     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ osteop        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ diabetes      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mtlstress     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ anxiety       <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ depress       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ mtlother      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ mntlcond      <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ mntlcurr      <dbl> 99, 1, 99, 0, 0, 0, 0, 0, 99, 0, 0, 1, 0, 0, 0, 0, 0, 1,…

9.4.2 Reading Stata files

The data have been collected from the US Federal Reserve System website located at this url

The 2019 Survey of Consumer Finances (SCF) is the most recent survey conducted. Below are links to the bulletin article, interactive chartbook, historical bulletin tables, full public dataset, extract dataset, replicate weight files, and documentation.

Please visit the page to learn more about the data and the survey.

df_stata = read_dta('data/scfp2019.dta')

df_stata

# A tibble: 28,885 × 351
     yy1    y1   wgt hhsex   age agecl  educ  edcl married  kids    lf lifecl
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl>  <dbl>
 1     1    11 6120.     2    75     6    12     4       2     0     1      5
 2     1    12 4712.     2    75     6    12     4       2     0     1      5
 3     1    13 5145.     2    75     6    12     4       2     0     1      5
 4     1    14 5298.     2    75     6    12     4       2     0     1      5
 5     1    15 4762.     2    75     6    12     4       2     0     1      5
 6     2    21 3790.     1    50     3     8     2       1     3     1      3
 7     2    22 3799.     1    50     3     8     2       1     3     1      3
 8     2    23 3799.     1    50     3     8     2       1     3     1      3
 9     2    24 3788.     1    50     3     8     2       1     3     1      3
10     2    25 3793.     1    50     3     8     2       1     3     1      3
# … with 28,875 more rows, and 339 more variables: famstruct <dbl>,
#   racecl <dbl>, racecl4 <dbl>, race <dbl>, occat1 <dbl>, occat2 <dbl>,
#   indcat <dbl>, foodhome <dbl>, foodaway <dbl>, fooddelv <dbl>, rent <dbl>,
#   income <dbl>, wageinc <dbl>, bussefarminc <dbl>, intdivinc <dbl>,
#   kginc <dbl>, ssretinc <dbl>, transfothinc <dbl>, penacctwd <dbl>,
#   norminc <dbl>, wsaved <dbl>, saved <dbl>, savres1 <dbl>, savres2 <dbl>,
#   savres3 <dbl>, savres4 <dbl>, savres5 <dbl>, savres6 <dbl>, …

9.4.3 Reading SAS files

Principles of Econometrics, 5th Edition provides many downloadable SAS data sets from their website.

df_sas = read_sas('data/profits.sas7bdat')
df_sas

# A tibble: 163 × 3
   PROFITS   QTR  YEAR
     <dbl> <dbl> <dbl>
 1    29.8     1  1959
 2    32.5     2  1959
 3    29.2     3  1959
 4    28.6     4  1959
 5    31.5     1  1960
 6    29.2     2  1960
 7    27.9     3  1960
 8    26.9     4  1960
 9    26.2     1  1961
10    27.7     2  1961
# … with 153 more rows