1.5 Remove Stop Words

Stop words usually add no analytic value, but you should pay attention to what you are dropping. Be ready to keep pertinent words in the data by taking them off the stop list, and to drop additional words by adding them to it.

# Build the stop list in one step: the standard tidytext list with a few
# potentially useful words taken off it, followed by custom stop words.
# The custom rows supply only `word`; bind_rows() fills any other columns
# of the standard list with NA for them.
stop <- bind_rows(
  # Standard list minus words worth keeping in the data.
  filter(tidytext::stop_words, !word %in% c("appreciate", "room", "first")),
  # Custom additions to the stop list.
  tibble(word = c("hotel", "stay"))
)

# Keep only the tokens whose word does not appear on the stop list.
token <- token_2 %>%
  anti_join(stop, by = "word")

# Most frequently removed words: the rows of token_2 whose word is on the
# stop list are exactly the rows that were dropped when building `token`.
token_2 %>%
  semi_join(stop, by = "word") %>%
  count(word, sort = TRUE)
## # A tibble: 459 × 2
##    word      n
##    <chr> <int>
##  1 the   12284
##  2 be    10251
##  3 and    7252
##  4 a      6109
##  5 to     4818
##  6 in     3322
##  7 we     3066
##  8 of     2769
##  9 i      2767
## 10 have   2533
## # ℹ 449 more rows