Section 2 of my Battle of the Bands text mining project is a brief exploration of the lyrics dataset to get acquainted with the data and perform any transformations that might be useful in subsequent analyses.

The dataset consists of 466 songs spread fairly evenly across the three bands. Queen’s members collaborated on only 15 songs. Freddie Mercury and Brian May wrote most songs, but even John Deacon wrote 16 songs.

# Sort by band then writer, and make `writer` a factor whose levels follow that
# sorted order, with the "Queen" level moved to sit after the fifth level.
# NOTE(review): presumably this lists the collective "Queen" credit after the
# individual band members -- confirm against the resulting levels.
lyrics <- lyrics %>%
  arrange(band, writer) %>%
  mutate(writer = fct_relevel(fct_inorder(writer), "Queen", after = 5))
# Albums and songs per band/writer with a totals row, rendered as a flextable.
# The janitor adorn_*() steps turn counts into column percentages and then
# append the raw counts.
lyrics %>%
  group_by(band, writer) %>%
  summarize(albums = n_distinct(album), songs = n(), .groups = "drop") %>%
  adorn_totals() %>%
  adorn_percentages(denominator = "col") %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  flextable::flextable() %>%
  flextable::autofit()

Song Counts

The bands have substantial overlap in productive years. All three bands had a flurry of output over their first half-decade or so, then gradually slowed their pace. AC/DC is still churning out music!

# Songs per album over time, one facet row per band, each bar labelled with its
# album title.
lyrics %>%
  count(band, released, album) %>%
  mutate(
    # Override the plotted year for three albums.
    # NOTE(review): presumably so bars do not share a year within a band --
    # confirm against the release data.
    released = case_when(
      album == "Fly by Night" ~ 1974L,
      album == "Queen" ~ 1972L,
      album == "Queen II" ~ 1973L,
      TRUE ~ released
    )
  ) %>%
  ggplot(aes(x = released, y = n, fill = band)) +
  geom_col(show.legend = FALSE) +
  # Album titles are drawn vertically, anchored near the base of each bar.
  geom_text(
    aes(label = album, y = 0.5),
    angle = 90, hjust = "bottom", vjust = .25, size = 3, color = "grey50"
  ) +
  scale_fill_manual(values = band_palette) +
  scale_x_continuous(
    limits = c(1970, 2025),
    breaks = seq(1970, 2025, by = 5),
    minor_breaks = 1970:2025,
    expand = c(0, 0)
  ) +
  facet_grid(rows = vars(band)) +
  labs(
    x = "Released", y = "Songs on Album",
    title = glue("{n_distinct(lyrics$album)} albums, {nrow(lyrics)} songs.")
  ) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Line Counts

# Attach each song's line count (number of `lyrics_lines` rows per song_id).
# The inner join keeps only songs that have at least one row in `lyrics_lines`.
lyrics_1 <- inner_join(
  lyrics,
  count(lyrics_lines, song_id, name = "n_lines"),
  by = "song_id"
)

# Line-count quantiles (quantile() defaults) per writer: one row per writer,
# with the writer name stored in the `writer` id column.
lc_quant_by_writer <- lyrics_1$n_lines %>%
  split(lyrics_1$writer) %>%
  lapply(quantile) %>%
  bind_rows(.id = "writer")

# Overall line-count quantiles across all songs; `lc_quant` is also used for
# the line-count plot title.
lc_quant <- quantile(lyrics_1$n_lines)
# Label the overall row in the `writer` column -- the id column of
# `lc_quant_by_writer` -- so bind_rows() stacks it under the per-writer rows
# instead of creating a separate, mostly-NA `band` column.
lc_quant_df <- lc_quant %>% as_tibble_row() %>% bind_cols(writer = "Overall")
lc_quant_tbl <- bind_rows(lc_quant_by_writer, lc_quant_df)

# Titles and writers of the songs with the fewest and the most lines.
# slice_min()/slice_max() keep ties by default, so each result may hold more
# than one value.
shortest_songs <- pull(slice_min(lyrics_1, order_by = n_lines), song)
shortest_song_writer <- pull(slice_min(lyrics_1, order_by = n_lines), writer)
longest_songs <- pull(slice_max(lyrics_1, order_by = n_lines), song)
longest_song_writer <- pull(slice_max(lyrics_1, order_by = n_lines), writer)

The overall IQR of lines per song was 36 to 53 lines, and those quantiles were similar among the writers. The median song length for each writer ranged from 42 to 51 lines. Freddie Mercury and Queen had the longest median line counts, AC/DC and Brian May the shortest.

# Render the line-count quantile table as an auto-sized, captioned flextable.
flextable::set_caption(
  flextable::autofit(flextable::flextable(lc_quant_tbl)),
  "Song Lines Quantiles"
)

Brian May and Freddie Mercury wrote the shortest songs. Dear Friends (Brian May) and Bijou (Freddie Mercury) were 9 lines. The longest song was The Fountain of Lamneth by Neil Peart (189 lines).

# Box plots of per-song line counts by release year, one panel per writer, with
# jittered points for the individual songs. The `text` aesthetic is only used
# as the hover tooltip once the plot is converted with ggplotly().
p <- ggplot(
  lyrics_1,
  aes(
    x = released, y = n_lines, group = as.factor(released), color = band,
    text = glue(
      "Band: {band} <br>",
      "Album: {album} <br>",
      "Lyrics: {writer} <br>",
      "Song: {song} <br>",
      "Lines: {n_lines}"
    )
  )
) +
  geom_boxplot() +
  # height = 0 jitters horizontally only, so plotted line counts stay exact.
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  scale_color_manual(values = band_palette) +
  facet_wrap(vars(writer)) +
  labs(
    x = NULL, y = "Line Count",
    title = glue("Line count IQR is {lc_quant['25%']} to {lc_quant['75%']} lines.")
  ) +
  theme_light() +
  theme(legend.position = "none")

ggplotly(p, tooltip = "text")

Word Counts

The tidytext package splits the text into “tokens” (words).

# Tokenize the `lyrics` column into one row per word, then count the words in
# each song.
word_count <- count(
  unnest_tokens(lyrics_1, output = "word", input = "lyrics", token = "words"),
  song_id,
  name = "n_words"
)

# Add the per-song word count; the inner join keeps only songs present in
# `word_count`.
lyrics_2 <- inner_join(lyrics_1, word_count, by = "song_id")

# Word-count quantiles (quantile() defaults) per writer, one row per writer.
# Split by the writer column of `lyrics_2` itself: the inner join that builds
# `lyrics_2` can drop songs, so a grouping vector taken from `lyrics_1` is not
# guaranteed to line up with its rows.
wc_quant_by_writer <- lyrics_2 %>%
  split(lyrics_2$writer) %>%
  lapply(function(x) quantile(x$n_words)) %>%
  bind_rows(.id = "writer")

# Overall word-count quantiles across all songs; `wc_quant` is also used for
# the word-count plot title.
wc_quant <- quantile(lyrics_2$n_words)
# Label the overall row in the `writer` column -- the id column of
# `wc_quant_by_writer` -- so bind_rows() stacks it under the per-writer rows
# instead of creating a separate, mostly-NA `band` column.
wc_quant_df <- wc_quant %>% as_tibble_row() %>% bind_cols(writer = "Overall")
wc_quant_tbl <- bind_rows(wc_quant_by_writer, wc_quant_df)

# Titles and writers of the songs with the fewest and the most words.
# slice_min()/slice_max() keep ties by default, so each result may hold more
# than one value.
shortest_wc_song <- pull(slice_min(lyrics_2, order_by = n_words), song)
shortest_wc_song_writer <- pull(slice_min(lyrics_2, order_by = n_words), writer)
longest_wc_song <- pull(slice_max(lyrics_2, order_by = n_words), song)
longest_wc_song_writer <- pull(slice_max(lyrics_2, order_by = n_words), writer)

The overall IQR of words per song was 178 to 275 words. IQRs varied among the writers by about 10%.

# Render the word-count quantile table as an auto-sized, captioned flextable.
flextable::set_caption(
  flextable::autofit(flextable::flextable(wc_quant_tbl)),
  "Song Words Quantiles"
)

The shortest song was Bijou (32 words) by Freddie Mercury. The longest song was 2112 (1062 words) by Neil Peart.

# Box plots of per-song word counts by release year, one panel per writer, with
# jittered points for the individual songs. The `text` aesthetic is only used
# as the hover tooltip once the plot is converted with ggplotly().
p <- ggplot(
  lyrics_2,
  aes(
    x = released, y = n_words, group = as.factor(released), color = band,
    text = glue(
      "Band: {band} <br>",
      "Album: {album} <br>",
      "Lyrics: {writer} <br>",
      "Song: {song} <br>",
      "Lines: {n_lines} <br>",
      "Words: {n_words}"
    )
  )
) +
  geom_boxplot() +
  # height = 0 jitters horizontally only, so plotted word counts stay exact.
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  scale_color_manual(values = band_palette) +
  facet_wrap(vars(writer)) +
  labs(
    x = NULL, y = "Word Count",
    title = glue("Word count IQR is {wc_quant['25%']} to {wc_quant['75%']} words.")
  ) +
  theme_light() +
  theme(legend.position = "none")

ggplotly(p, tooltip = "text")

Save Work

Save the lyrics with summary stats for subsequent steps.

# Persist the lyrics with their line and word counts for the next section.
saveRDS(lyrics_2, file = "./2_lyrics.Rds")