1.2 Tokenize
Most text models operate on tokenized text. Even if you ultimately want bigrams, tokenize into single words first to clean and regularize the data (a bigram variant is sketched after the code below).
# Split each review into one row per word; by default, unnest_tokens()
# also lowercases the text and strips punctuation.
token_0 <-
  hotel_1 %>%
  select(review_id, review) %>%
  unnest_tokens(word, review)
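If bigrams are the eventual goal, the same verb handles them. A minimal sketch for comparison only, not part of the pipeline here, using tidytext's "ngrams" tokenizer; the name bigram_0 is hypothetical:

# For illustration: tokenize directly into bigrams instead of words.
# token = "ngrams" with n = 2 yields overlapping two-word tokens.
bigram_0 <-
  hotel_1 %>%
  select(review_id, review) %>%
  unnest_tokens(bigram, review, token = "ngrams", n = 2)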
# Attach per-review word counts back to the main data frame, just to
# aid understanding.
hotel_2 <-
  hotel_1 %>%
  inner_join(count(token_0, review_id, name = "raw_wordcnt"),
             by = join_by(review_id))
hotel_2 %>% select(raw_chrcnt, raw_wordcnt) %>% summary()
##    raw_chrcnt     raw_wordcnt    
##  Min.   :   1   Min.   :   1.0  
##  1st Qu.: 308   1st Qu.:  56.0  
##  Median : 521   Median :  95.0  
##  Mean   : 712   Mean   : 131.8  
##  3rd Qu.: 870   3rd Qu.: 163.0  
##  Max.   :7157   Max.   :1331.0  
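The two counts track each other as expected: the means imply roughly 712 / 131.8 ≈ 5.4 characters per word, spaces included, which is plausible for English prose. It can also help to glance at the tokens themselves. A minimal sketch, output omitted; before any stop-word filtering, the top entries will typically be function words such as "the" and "a":

# Quick sanity check: the most frequent tokens across all reviews.
token_0 %>%
  count(word, sort = TRUE) %>%
  head()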