Section 2 of my Battle of the Bands text mining project is a brief exploration of the lyrics dataset to get acquainted with the data and perform any transformations that might be useful in subsequent analyses.

The dataset consists of 466 songs spread fairly evenly across the three bands. Queen’s members collaborated on only 15 songs. Freddie Mercury and Brian May wrote most songs, but even John Deacon wrote 16 songs.

# Sort by band and writer, then freeze that order as the factor levels.
# "Queen" (the group credit) is re-inserted after the fifth of the remaining
# levels, which places it after the individual Queen members.
lyrics <- lyrics %>%
  arrange(band, writer) %>%
  mutate(writer = fct_relevel(fct_inorder(writer), "Queen", after = 5))

# Albums and songs per band/writer, shown as column percentages with counts.
# Note: the albums total adds up each writer's distinct-album count, so an
# album with several credited writers contributes once per writer.
lyrics %>%
  group_by(band, writer) %>%
  summarise(
    albums = n_distinct(album),
    songs = n(),
    .groups = "drop"
  ) %>%
  adorn_totals() %>%
  adorn_percentages(denominator = "col") %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  flextable::flextable() %>%
  flextable::autofit()

band	writer	albums	songs
AC/DC	AC/DC	18.0% (16)	37.6% (175)
Queen	Brian May	14.6% (13)	8.4% (39)
Queen	Freddie Mercury	14.6% (13)	10.7% (50)
Queen	John Deacon	12.4% (11)	3.4% (16)
Queen	Roger Taylor	15.7% (14)	4.9% (23)
Queen	Queen	4.5% (4)	3.2% (15)
Rush	Neil Peart	20.2% (18)	31.8% (148)
Total	-	100.0% (89)	100.0% (466)

Song Counts

The bands have substantial overlap in productive years. All three bands had a flurry of output over their first half-decade or so, then gradually slowed their pace. AC/DC is still churning out music!

# Songs per album, drawn at the album's release year, one facet row per band.
lyrics %>%
  count(band, released, album) %>%
  mutate(
    # Override the plotted year for three albums (presumably so that two
    # albums of one band do not share a year on the axis; TODO confirm).
    released = case_when(
      album == "Fly by Night" ~ 1974L,
      album == "Queen" ~ 1972L,
      album == "Queen II" ~ 1973L,
      TRUE ~ released
    )
  ) %>%
  ggplot(aes(x = released, y = n, fill = band)) +
  geom_col(show.legend = FALSE) +
  # Album titles are written vertically, starting just above the baseline.
  geom_text(
    aes(label = album, y = 0.5),
    angle = 90, hjust = "bottom", vjust = 0.25, size = 3, color = "grey50"
  ) +
  scale_fill_manual(values = band_palette) +
  scale_x_continuous(
    limits = c(1970, 2025),
    breaks = seq(1970, 2025, by = 5),
    minor_breaks = 1970:2025,
    expand = c(0, 0)
  ) +
  facet_grid(rows = vars(band)) +
  labs(
    x = "Released",
    y = "Songs on Album",
    title = glue("{n_distinct(lyrics$album)} albums, {nrow(lyrics)} songs.")
  ) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Line Counts

# Attach the number of lyric lines per song (songs without lines are dropped
# by the inner join).
lyrics_1 <- lyrics %>%
  inner_join(lyrics_lines %>% count(song_id, name = "n_lines"), by = "song_id")

# Line-count quantiles (0/25/50/75/100%) per writer, one row per writer.
lc_quant_by_writer <- lyrics_1 %>%
  split(lyrics_1$writer) %>%
  lapply(function(x) quantile(x$n_lines)) %>%
  bind_rows(.id = "writer")

# Overall quantiles. The label goes in `writer` so the "Overall" row lines up
# with the per-writer rows instead of creating a separate, mostly empty column.
lc_quant <- quantile(lyrics_1$n_lines)
lc_quant_df <- lc_quant %>% as_tibble_row() %>% bind_cols(writer = "Overall")
lc_quant_tbl <- bind_rows(lc_quant_by_writer, lc_quant_df)

# Shortest and longest songs by line count. Ties are kept, so each result may
# hold more than one song; the extremes are computed once and reused.
fewest_lines <- lyrics_1 %>% slice_min(order_by = n_lines)
most_lines <- lyrics_1 %>% slice_max(order_by = n_lines)

shortest_songs <- fewest_lines %>% pull(song)
shortest_song_writer <- fewest_lines %>% pull(writer)
longest_songs <- most_lines %>% pull(song)
longest_song_writer <- most_lines %>% pull(writer)

The overall IQR of lines per song was 36 to 53 lines, and those quantiles were similar among the writers. The median song length for each writer ranged from 42 to 51 lines. Freddie Mercury and Queen had the longest median line counts, AC/DC and Brian May the shortest.

# Render the line-count quantile table as an auto-sized, captioned flextable.
flextable::set_caption(
  flextable::autofit(flextable::flextable(lc_quant_tbl)),
  caption = "Song Lines Quantiles"
)

Song Lines Quantiles
writer	0%	25%	50%	75%	100%	band
AC/DC	21	35.00	42	49	88
Brian May	9	33.00	42	53	95
Freddie Mercury	9	39.50	51	61	120
John Deacon	24	33.75	43	48	63
Roger Taylor	29	35.00	48	53	74
Queen	15	46.50	50	57	82
Neil Peart	13	36.75	44	53	189
	9	36.00	44	53	189	Overall

Brian May and Freddie Mercury wrote the shortest songs. Dear Friends (Brian May) and Bijou (Freddie Mercury) were 9 lines. The longest song was The Fountain of Lamneth by Neil Peart (189 lines).

# Box plots of lines per song by release year, one panel per writer, with each
# song overlaid as a horizontally jittered point. The `text` aesthetic is only
# there to supply the hover tooltip once the plot is converted to plotly.
p <- ggplot(
  lyrics_1,
  aes(
    x = released,
    y = n_lines,
    group = as.factor(released),
    color = band,
    text = glue(
      "Band: {band} <br>",
      "Album: {album} <br>",
      "Lyrics: {writer} <br>",
      "Song: {song} <br>",
      "Lines: {n_lines}"
    )
  )
) +
  geom_boxplot() +
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  scale_color_manual(values = band_palette) +
  facet_wrap(vars(writer)) +
  labs(
    x = NULL,
    y = "Line Count",
    title = glue("Line count IQR is {lc_quant['25%']} to {lc_quant['75%']} lines.")
  ) +
  theme_light() +
  theme(legend.position = "none")

ggplotly(p, tooltip = "text")

Word Counts

The tidytext package splits the text into “tokens” (words).

# Tokenize each song's lyrics into words and count the words per song.
word_count <- lyrics_1 %>%
  unnest_tokens(output = "word", input = "lyrics", token = "words") %>%
  count(song_id, name = "n_words")

# Attach the word counts (songs without any tokens are dropped by the join).
lyrics_2 <- lyrics_1 %>%
  inner_join(word_count, by = "song_id")

# Word-count quantiles per writer. Split on lyrics_2's own writer column so the
# grouping stays aligned with its rows even if the join above dropped songs.
wc_quant_by_writer <- lyrics_2 %>%
  split(lyrics_2$writer) %>%
  lapply(function(x) quantile(x$n_words)) %>%
  bind_rows(.id = "writer")

# Overall quantiles. The label goes in `writer` so the "Overall" row lines up
# with the per-writer rows instead of creating a separate, mostly empty column.
wc_quant <- quantile(lyrics_2$n_words)
wc_quant_df <- wc_quant %>% as_tibble_row() %>% bind_cols(writer = "Overall")
wc_quant_tbl <- bind_rows(wc_quant_by_writer, wc_quant_df)

# Shortest and longest songs by word count. Ties are kept, so each result may
# hold more than one song; the extremes are computed once and reused.
fewest_words <- lyrics_2 %>% slice_min(order_by = n_words)
most_words <- lyrics_2 %>% slice_max(order_by = n_words)

shortest_wc_song <- fewest_words %>% pull(song)
shortest_wc_song_writer <- fewest_words %>% pull(writer)
longest_wc_song <- most_words %>% pull(song)
longest_wc_song_writer <- most_words %>% pull(writer)

The overall IQR of words per song was 178 to 275 words. IQRs varied among the writers by about 10%.

# Render the word-count quantile table as an auto-sized, captioned flextable.
flextable::set_caption(
  flextable::autofit(flextable::flextable(wc_quant_tbl)),
  caption = "Song Words Quantiles"
)

Song Words Quantiles
writer	0%	25%	50%	75%	100%	band
AC/DC	94	167.50	209	254.00	509
Brian May	58	168.50	222	273.50	566
Freddie Mercury	32	200.00	250	294.75	460
John Deacon	106	196.25	227	245.00	330
Roger Taylor	124	174.00	214	250.50	336
Queen	67	207.50	231	268.00	370
Neil Peart	72	190.00	240	286.50	1,062
	32	178.00	226	275.00	1,062	Overall

The shortest song was Bijou (32 words) by Freddie Mercury. The longest song was 2112 (1062 words) by Neil Peart.

# Box plots of words per song by release year, one panel per writer, with each
# song overlaid as a horizontally jittered point. The `text` aesthetic is only
# there to supply the hover tooltip once the plot is converted to plotly.
p <- ggplot(
  lyrics_2,
  aes(
    x = released,
    y = n_words,
    group = as.factor(released),
    color = band,
    text = glue(
      "Band: {band} <br>",
      "Album: {album} <br>",
      "Lyrics: {writer} <br>",
      "Song: {song} <br>",
      "Lines: {n_lines} <br>",
      "Words: {n_words}"
    )
  )
) +
  geom_boxplot() +
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  scale_color_manual(values = band_palette) +
  facet_wrap(vars(writer)) +
  labs(
    x = NULL,
    y = "Word Count",
    title = glue("Word count IQR is {wc_quant['25%']} to {wc_quant['75%']} words.")
  ) +
  theme_light() +
  theme(legend.position = "none")

ggplotly(p, tooltip = "text")

Save Work

Save the lyrics with summary stats for subsequent steps.

# Write the lyrics, now carrying n_lines and n_words, to disk for later steps.
saveRDS(object = lyrics_2, file = "./2_lyrics.Rds")

Battle of the Bands: Text Mining Lyrics from Rush, Queen, and AC/DC

Section 2: Data Exploration

Michael Foley

2021-09-28

Song Counts

Line Counts

Word Counts

Save Work