8  Data import

Prerequisites

library(tidyverse)
#> ── Attaching core tidyverse packages ───────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ─────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

8.2.4 Exercises

  1. For reading a file delimited with |, use read_delim() with argument delim = "|".

  2. All other arguments are common to both functions, read_csv() and read_tsv().

  3. For read_fwf(), col_positions is an important argument since it defines where each column begins and ends.

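  4. We need to specify the quote argument, since the value containing the comma is wrapped in single quotes rather than the default double quotes.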

    read_csv("x,y\n1,'a,b'", quote = "\'")
    #> Rows: 1 Columns: 2
    #> ── Column specification ─────────────────────────────────────────────────────
    #> Delimiter: ","
    #> chr (1): y
    #> dbl (1): x
    #> 
    #> ℹ Use `spec()` to retrieve the full column specification for this data.
    #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
    #> # A tibble: 1 × 2
    #>       x y    
    #>   <dbl> <chr>
    #> 1     1 a,b
  5. The problem with each read_csv() statement is shown below:

    • There are only two column headers but three values in each row, so the last two get merged:

      read_csv("a,b\n1,2,3\n4,5,6")
      #> Warning: One or more parsing issues, call `problems()` on your data frame for
      #> details, e.g.:
      #>   dat <- vroom(...)
      #>   problems(dat)
      #> Rows: 2 Columns: 2
      #> ── Column specification ─────────────────────────────────────────────────────
      #> Delimiter: ","
      #> dbl (1): a
      #> num (1): b
      #> 
      #> ℹ Use `spec()` to retrieve the full column specification for this data.
      #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
      #> # A tibble: 2 × 2
      #>       a     b
      #>   <dbl> <dbl>
      #> 1     1    23
      #> 2     4    56
    • There are only three column headers. The first row is missing a value in the last column, so it gets an NA there; the second row has four values, so the last two get merged:

      read_csv("a,b,c\n1,2\n1,2,3,4")
      #> Warning: One or more parsing issues, call `problems()` on your data frame for
      #> details, e.g.:
      #>   dat <- vroom(...)
      #>   problems(dat)
      #> Rows: 2 Columns: 3
      #> ── Column specification ─────────────────────────────────────────────────────
      #> Delimiter: ","
      #> dbl (2): a, b
      #> num (1): c
      #> 
      #> ℹ Use `spec()` to retrieve the full column specification for this data.
      #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
      #> # A tibble: 2 × 3
      #>       a     b     c
      #>   <dbl> <dbl> <dbl>
      #> 1     1     2    NA
      #> 2     1     2    34
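    • The opening double quote before 1 is never closed (and the row has only one value for two columns), so no rows are read in: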

      read_csv("a,b\n\"1")
      #> Rows: 0 Columns: 2
      #> ── Column specification ─────────────────────────────────────────────────────
      #> Delimiter: ","
      #> chr (2): a, b
      #> 
      #> ℹ Use `spec()` to retrieve the full column specification for this data.
      #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
      #> # A tibble: 0 × 2
      #> # ℹ 2 variables: a <chr>, b <chr>
    • Each column has a numerical and a character value, so the column type is coerced to character:

      read_csv("a,b\n1,2\na,b")
      #> Rows: 2 Columns: 2
      #> ── Column specification ─────────────────────────────────────────────────────
      #> Delimiter: ","
      #> chr (2): a, b
      #> 
      #> ℹ Use `spec()` to retrieve the full column specification for this data.
      #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
      #> # A tibble: 2 × 2
      #>   a     b    
      #>   <chr> <chr>
      #> 1 1     2    
      #> 2 a     b
    • The delimiter is ; but it’s not specified, therefore this is read in as a single-column data frame with a single observation:

      read_csv("a;b\n1;3")
      #> Rows: 1 Columns: 1
      #> ── Column specification ─────────────────────────────────────────────────────
      #> Delimiter: ","
      #> chr (1): a;b
      #> 
      #> ℹ Use `spec()` to retrieve the full column specification for this data.
      #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
      #> # A tibble: 1 × 1
      #>   `a;b`
      #>   <chr>
      #> 1 1;3
  6. Variables with non-syntactic names can be referred to by surrounding them with backticks, as follows.

    annoying <- tibble(
      `1` = 1:10,
      `2` = `1` * 2 + rnorm(length(`1`))
    )
    1. Extracting the variable called 1:
    annoying |>
      select(`1`)
    #> # A tibble: 10 × 1
    #>     `1`
    #>   <int>
    #> 1     1
    #> 2     2
    #> 3     3
    #> 4     4
    #> 5     5
    #> 6     6
    #> # ℹ 4 more rows
    2. Plotting a scatterplot of 1 vs. 2:
    ggplot(annoying, aes(x = `2`, y = `1`)) +
      geom_point()

    3. Creating a new column called 3, which is 2 divided by 1:
    annoying |>
      mutate(`3` = `2` / `1`)
    #> # A tibble: 10 × 3
    #>     `1`    `2`   `3`
    #>   <int>  <dbl> <dbl>
    #> 1     1  0.600 0.600
    #> 2     2  4.26  2.13 
    #> 3     3  3.56  1.19 
    #> 4     4  7.99  2.00 
    #> 5     5 10.6   2.12 
    #> 6     6 13.1   2.19 
    #> # ℹ 4 more rows
    4. Renaming the columns to one, two, and three:
    annoying |>
      mutate(`3` = `2` / `1`) |>
      rename(
        "one" = `1`,
        "two" = `2`,
        "three" = `3`
      )
    #> # A tibble: 10 × 3
    #>     one    two three
    #>   <int>  <dbl> <dbl>
    #> 1     1  0.600 0.600
    #> 2     2  4.26  2.13 
    #> 3     3  3.56  1.19 
    #> 4     4  7.99  2.00 
    #> 5     5 10.6   2.12 
    #> 6     6 13.1   2.19 
    #> # ℹ 4 more rows