Data Wrangling

Author

Shifa Maqsood

library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(dplyr) # data wrangling functions
budget <- read_csv("data/budget.csv", show_col_types = FALSE)

Select

# select single column by name
product_dat <- budget %>% select(product) 

# select single column by number
(product_dat <- budget %>% select(2,3))
# A tibble: 8 × 2
  product sales_2019
  <chr>        <dbl>
1 widgets       2129
2 gadgets        723
3 widgets       1123
4 gadgets       2022
5 widgets       -728
6 gadgets       -423
7 widgets        633
8 gadgets       1204
# select columns individually
sales2019 <- budget %>% select(region, product, sales_2019)

# select columns with colon
sales2019 <- budget %>% select(region:sales_2019)
regions <- budget %>% select(`Sales Region` = 1, 3:6) #renaming columns

head(regions, 2)
# A tibble: 2 × 5
  `Sales Region` sales_2019 sales_2020 expenses_2019 expenses_2020
  <chr>               <dbl>      <dbl>         <dbl>         <dbl>
1 North                2129       -517           822          -897
2 North                 723         77          1037          1115
# de-select individual columns
sales <- budget %>% select(-expenses_2019, -expenses_2020)

# de-select a range of columns
sales <- budget %>% select(-(expenses_2019:expenses_2020))

Filters

# select all rows where region equals North
budget %>% filter(region == "North")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 North  gadgets        723         77          1037        1115 very h… very h…
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# select all rows where expenses_2020 were exactly equal to 200
budget %>% filter(expenses_2020 == 200)
# A tibble: 1 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 South  gadgets       2022       -945          -610         200 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# select all rows where sales_2019 was more than 100
budget %>% filter(sales_2019 > 100)
# A tibble: 6 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 North  gadgets        723         77          1037        1115 very h… very h…
3 South  widgets       1123      -1450          1004         672 high    neutral
4 South  gadgets       2022       -945          -610         200 low     low    
5 West   widgets        633        790           783        -315 neutral neutral
6 West   gadgets       1204        426           433        -136 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# everything but the North
budget %>% filter(region != "North")
# A tibble: 6 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 South  widgets       1123      -1450          1004         672 high    neutral
2 South  gadgets       2022       -945          -610         200 low     low    
3 East   widgets       -728        -51          -801        -342 very l… very l…
4 East   gadgets       -423       -354            94        2036 neutral high   
5 West   widgets        633        790           783        -315 neutral neutral
6 West   gadgets       1204        426           433        -136 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# regions and products with profit in both 2019 and 2020
profit_both <- budget %>% 
  filter(
    sales_2019 > expenses_2019,
    sales_2020 > expenses_2020
  )

# the same as above, using & instead of a comma
profit_both <- budget %>% 
  filter(
    sales_2019 > expenses_2019 &
    sales_2020 > expenses_2020
  )

# regions and products with profit in 2019 or 2020
profit_either <- budget %>% 
  filter(
    sales_2019 > expenses_2019 | #logical operator for or
    sales_2020 > expenses_2020
  )

# 2020 profit greater than 1000
profit_1000 <- budget %>%
  filter(sales_2020 - expenses_2020 > 1000)

%in% operator

# retain any rows where region is north or south, and where product equals widget
budget %>%
  filter(region %in% c("North", "South"),
         product == "widgets")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 South  widgets       1123      -1450          1004         672 high    neutral
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# retain any rows where the region is not east or west, and where the product does not equal gadgets
budget %>%
  filter(!region %in% c("East", "West"),
         product != "gadgets")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 South  widgets       1123      -1450          1004         672 high    neutral
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020

Arrange

budget %>%
  mutate(region = factor(region, levels = c("North", "South", "East", "West"))) %>%
  filter(product == "gadgets") %>%
  arrange(region) # arrange function sorts out data
# A tibble: 4 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <fct>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  gadgets        723         77          1037        1115 very h… very h…
2 South  gadgets       2022       -945          -610         200 low     low    
3 East   gadgets       -423       -354            94        2036 neutral high   
4 West   gadgets       1204        426           433        -136 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020

Mutate

budget2 <- budget %>%
  mutate(
    sales = sales_2019 + sales_2020,
    expenses = expenses_2019 + expenses_2020,
    profit = sales - expenses,
    region = paste(region, "Office")
  )

#mutate with logical operator 
budget2 <- budget2 %>%
  mutate(profit_category = profit > 0,
         product = as.factor(product))

# you do not have to use TRUE or FALSE all the time and you can use case_when function 
budget3 <- budget2 %>%
  mutate(profit_category = case_when(profit > 0 ~ "PROFIT",
                                     profit < 0 ~ "NO PROFIT"))

# use tilda and %in% operator 

# create a column where people get a bonus if customer satisfaction was overall high or very high

bonus <- budget3 %>%
  mutate(bonus_2019 = case_when(satisfaction_2019 %in% c("very low", "low", "neutral") ~ "no bonus",
                                satisfaction_2019 %in% c("high", "very high") ~ "bonus"))

Summarise

## first we need to adjust the data for analysis
budget4 <- budget %>%
  select(-satisfaction_2019, -satisfaction_2020) %>%
  pivot_longer(cols = sales_2019:expenses_2020,
               names_to = c("type", "year"),
               names_sep = "_",
               values_to = "value") %>%
  pivot_wider(names_from = type,
              values_from = value)

head(budget4) # check the format
# A tibble: 6 × 5
  region product year  sales expenses
  <chr>  <chr>   <chr> <dbl>    <dbl>
1 North  widgets 2019   2129      822
2 North  widgets 2020   -517     -897
3 North  gadgets 2019    723     1037
4 North  gadgets 2020     77     1115
5 South  widgets 2019   1123     1004
6 South  widgets 2020  -1450      672
#then summarize it
budget4 %>%
  summarise(
    mean_sales = mean(sales),
    mean_expenses = mean(expenses),
    min_profit = min(expenses - sales),
    max_profit = max(expenses - sales)
  )
# A tibble: 1 × 4
  mean_sales mean_expenses min_profit max_profit
       <dbl>         <dbl>      <dbl>      <dbl>
1       291.          318.      -2632       2390

##Group By

year_prod <- budget4 %>%
  group_by(year, product) %>%
  summarise(
    mean_sales = mean(sales),
    mean_expenses = mean(expenses),
    min_profit = min(expenses - sales),
    max_profit = max(expenses - sales)
  ) %>%
  ungroup()
`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.

Complications

year_prod %>%
  mutate(across(.cols = mean_sales:max_profit, 
                .fns = round))
# A tibble: 4 × 6
  year  product mean_sales mean_expenses min_profit max_profit
  <chr> <chr>        <dbl>         <dbl>      <dbl>      <dbl>
1 2019  gadgets        882           238      -2632        517
2 2019  widgets        789           452      -1307        150
3 2020  gadgets       -199           804       -562       2390
4 2020  widgets       -307          -220      -1105       2122
round(0.5)
[1] 0
round(1.5)
[1] 2
#!!!!!! redefining round so 5s round up !!!!!! 
round <- function(x, digits = 0) {
  posneg = sign(x)
  z = abs(x)*10^digits
  z = z + 0.5 + sqrt(.Machine$double.eps)
  z = trunc(z)
  z = z/10^digits
  z*posneg
}
# remove new round() method
rm(round)
missing_bad <- budget4 %>%
  mutate(expenses = ifelse(
    test = year == 2020 & region == "South", 
    yes = 0, # value if above conditions are met
    no = expenses # value if above conditions are not met
  ))
missing_bad <- budget4 %>%
  mutate(expenses = case_when(
    # set to 0 when year is 2020 and region is North
    year == 2020 & region == "South" ~ 0, 
    # otherwise, set to the value in the expenses column
    TRUE ~ expenses   
  ))
# set sales values to "missing" for North 2020 rows
missing_bad <- missing_bad %>%
  mutate(sales = ifelse(year == 2020 & region == "North", 
                        "missing", 
                        sales))

# check structure of data, sales now character
str(missing_bad)
tibble [16 × 5] (S3: tbl_df/tbl/data.frame)
 $ region  : chr [1:16] "North" "North" "North" "North" ...
 $ product : chr [1:16] "widgets" "widgets" "gadgets" "gadgets" ...
 $ year    : chr [1:16] "2019" "2020" "2019" "2020" ...
 $ sales   : chr [1:16] "2129" "missing" "723" "missing" ...
 $ expenses: num [1:16] 822 -897 1037 1115 1004 ...