Section 2 of my Battle of the Bands text mining project is a brief exploration of the lyrics dataset to get acquainted with the data and perform any transformations that might be useful in subsequent analyses.
The dataset consists of 466 songs spread fairly evenly across the three bands. Queen’s members collaborated on only 15 songs. Freddie Mercury and Brian May wrote most songs, but even John Deacon wrote 16 songs.
# Sort by band and writer, then freeze that order as the factor levels of
# `writer`. The "Queen" credit is then re-inserted after the fifth remaining
# level so that it follows the individual writers sorted ahead of it.
lyrics <- lyrics %>%
  arrange(band, writer) %>%
  mutate(
    writer = fct_inorder(writer),
    writer = fct_relevel(writer, "Queen", after = 5)
  )

# Distinct albums and song counts per band/writer, with a totals row, shown as
# column percentages followed by the underlying counts.
lyrics %>%
  group_by(band, writer) %>%
  summarize(
    .groups = "drop",
    albums = n_distinct(album),
    songs = n()
  ) %>%
  adorn_totals() %>%
  adorn_percentages(denominator = "col") %>%
  adorn_pct_formatting() %>%
  adorn_ns() %>%
  flextable::flextable() %>%
  flextable::autofit()
band | writer | albums | songs |
AC/DC | AC/DC | 18.0% (16) | 37.6% (175) |
Queen | Brian May | 14.6% (13) | 8.4% (39) |
Queen | Freddie Mercury | 14.6% (13) | 10.7% (50) |
Queen | John Deacon | 12.4% (11) | 3.4% (16) |
Queen | Roger Taylor | 15.7% (14) | 4.9% (23) |
Queen | Queen | 4.5% (4) | 3.2% (15) |
Rush | Neil Peart | 20.2% (18) | 31.8% (148) |
Total | - | 100.0% (89) | 100.0% (466) |
The bands have substantial overlap in productive years. All three bands had a flurry of output over their first half-decade or so, then gradually slowed their pace. AC/DC is still churning out music!
# Bar chart of songs per album by release year, one facet row per band, with
# each album name printed vertically at the base of its bar.
lyrics %>%
  count(band, released, album) %>%
  mutate(
    # Override the plotted year for three albums.
    # NOTE(review): presumably this keeps two albums by the same band from
    # landing on the same year -- confirm against the data.
    released = case_when(
      album == "Fly by Night" ~ 1974L,
      album == "Queen" ~ 1972L,
      album == "Queen II" ~ 1973L,
      TRUE ~ released
    )
  ) %>%
  ggplot(aes(x = released, y = n, fill = band)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = band_palette) +
  geom_text(
    aes(label = album, y = 0.5),
    angle = 90, hjust = "bottom", vjust = .25,
    size = 3, color = "grey50"
  ) +
  scale_x_continuous(
    limits = c(1970, 2025),
    breaks = seq(1970, 2025, by = 5),
    minor_breaks = 1970:2025,
    expand = c(0, 0)
  ) +
  theme_light() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5)
  ) +
  labs(
    x = "Released", y = "Songs on Album",
    title = glue("{n_distinct(lyrics$album)} albums, {nrow(lyrics)} songs.")
  ) +
  facet_grid(rows = vars(band))
# Attach a per-song line count (`n_lines`), taken as the number of rows each
# `song_id` has in `lyrics_lines`. The inner join keeps only songs present in
# both tables.
lyrics_1 <- lyrics %>%
  inner_join(
    lyrics_lines %>% count(song_id, name = "n_lines"),
    by = "song_id"
  )
# Quantiles (0/25/50/75/100%) of lines per song, one row per writer.
lc_quant_by_writer <- lyrics_1 %>%
  split(lyrics_1$writer) %>%
  lapply(function(x) quantile(x$n_lines)) %>%
  bind_rows(.id = "writer")

# The same quantiles over all songs, kept as a named vector because the plot
# title later indexes it by name (e.g. lc_quant["25%"]).
lc_quant <- quantile(lyrics_1$n_lines)

# One-row version of the overall quantiles, labelled "Overall" in a `band`
# column, appended beneath the per-writer rows.
lc_quant_df <- lc_quant %>%
  as_tibble_row() %>%
  bind_cols(band = "Overall")
lc_quant_tbl <- bind_rows(lc_quant_by_writer, lc_quant_df)
# Titles and writers of the songs with the fewest and the most lines.
# slice_min()/slice_max() keep ties by default, so each result may hold more
# than one value.
shortest_songs <- lyrics_1 %>%
  slice_min(order_by = n_lines) %>%
  pull(song)
shortest_song_writer <- lyrics_1 %>%
  slice_min(order_by = n_lines) %>%
  pull(writer)
longest_songs <- lyrics_1 %>%
  slice_max(order_by = n_lines) %>%
  pull(song)
longest_song_writer <- lyrics_1 %>%
  slice_max(order_by = n_lines) %>%
  pull(writer)
The overall IQR of lines per song was 36 to 53 lines, and those quantiles were similar among the writers. The median song length for each writer ranged from 42 to 51 lines. Freddie Mercury and Queen had the longest median line counts, AC/DC and Brian May the shortest.
# Render the line-count quantile table as an auto-fitted, captioned flextable.
lc_quant_tbl %>%
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::set_caption("Song Lines Quantiles")
writer | 0% | 25% | 50% | 75% | 100% | band |
AC/DC | 21 | 35.00 | 42 | 49 | 88 | |
Brian May | 9 | 33.00 | 42 | 53 | 95 | |
Freddie Mercury | 9 | 39.50 | 51 | 61 | 120 | |
John Deacon | 24 | 33.75 | 43 | 48 | 63 | |
Roger Taylor | 29 | 35.00 | 48 | 53 | 74 | |
Queen | 15 | 46.50 | 50 | 57 | 82 | |
Neil Peart | 13 | 36.75 | 44 | 53 | 189 | |
9 | 36.00 | 44 | 53 | 189 | Overall |
Brian May and Freddie Mercury wrote the shortest songs. Dear Friends (Brian May) and Bijou (Freddie Mercury) were 9 lines. The longest song was The Fountain of Lamneth by Neil Peart (189 lines).
# Box plots of lines per song by release year, one panel per writer, with each
# song overlaid as a jittered point. The `text` aesthetic is not drawn by
# ggplot; it supplies the hover tooltip once the plot is passed to ggplotly().
p <- lyrics_1 %>%
  ggplot(
    aes(
      x = released, y = n_lines, group = as.factor(released), color = band,
      text = glue(
        "Band: {band} <br>",
        "Album: {album} <br>",
        "Lyrics: {writer} <br>",
        "Song: {song} <br>",
        "Lines: {n_lines}"
      )
    )
  ) +
  geom_boxplot() +
  # height = 0 jitters horizontally only, so plotted line counts stay exact.
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  # scale_fill_manual(values = band_palette) +
  scale_color_manual(values = band_palette) +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = NULL, y = "Line Count",
    title = glue("Line count IQR is {lc_quant['25%']} to {lc_quant['75%']} lines.")
  ) +
  facet_wrap(vars(writer))

ggplotly(p, tooltip = "text")
The tidytext package splits the text into “tokens” (words).
# Split each song's `lyrics` text into one row per word, then count the rows
# per song to get `n_words`.
word_count <- lyrics_1 %>%
  unnest_tokens(output = "word", input = "lyrics", token = "words") %>%
  count(song_id, name = "n_words")
# Add the per-song word count to the song table. The inner join keeps only
# songs that have a row in `word_count`.
lyrics_2 <- lyrics_1 %>%
  inner_join(word_count, by = "song_id")
# Quantiles (0/25/50/75/100%) of words per song, one row per writer.
# The grouping vector is taken from lyrics_2 itself: lyrics_2 comes from an
# inner join, so it is not guaranteed to have the same rows as lyrics_1, and
# splitting by lyrics_1$writer could misalign songs and writers.
wc_quant_by_writer <- lyrics_2 %>%
  split(lyrics_2$writer) %>%
  lapply(function(x) quantile(x$n_words)) %>%
  bind_rows(.id = "writer")

# The same quantiles over all songs, kept as a named vector because the plot
# title later indexes it by name (e.g. wc_quant["25%"]).
wc_quant <- quantile(lyrics_2$n_words)

# One-row version of the overall quantiles, labelled "Overall" in a `band`
# column, appended beneath the per-writer rows.
wc_quant_df <- wc_quant %>%
  as_tibble_row() %>%
  bind_cols(band = "Overall")
wc_quant_tbl <- bind_rows(wc_quant_by_writer, wc_quant_df)
# Titles and writers of the songs with the fewest and the most words.
# slice_min()/slice_max() keep ties by default, so each result may hold more
# than one value.
shortest_wc_song <- lyrics_2 %>%
  slice_min(order_by = n_words) %>%
  pull(song)
shortest_wc_song_writer <- lyrics_2 %>%
  slice_min(order_by = n_words) %>%
  pull(writer)
longest_wc_song <- lyrics_2 %>%
  slice_max(order_by = n_words) %>%
  pull(song)
longest_wc_song_writer <- lyrics_2 %>%
  slice_max(order_by = n_words) %>%
  pull(writer)
The overall IQR of words per song was 178 to 275 words. IQRs varied among the writers by about 10%.
# Render the word-count quantile table as an auto-fitted, captioned flextable.
wc_quant_tbl %>%
  flextable::flextable() %>%
  flextable::autofit() %>%
  flextable::set_caption("Song Words Quantiles")
writer | 0% | 25% | 50% | 75% | 100% | band |
AC/DC | 94 | 167.50 | 209 | 254.00 | 509 | |
Brian May | 58 | 168.50 | 222 | 273.50 | 566 | |
Freddie Mercury | 32 | 200.00 | 250 | 294.75 | 460 | |
John Deacon | 106 | 196.25 | 227 | 245.00 | 330 | |
Roger Taylor | 124 | 174.00 | 214 | 250.50 | 336 | |
Queen | 67 | 207.50 | 231 | 268.00 | 370 | |
Neil Peart | 72 | 190.00 | 240 | 286.50 | 1,062 | |
32 | 178.00 | 226 | 275.00 | 1,062 | Overall |
The shortest song was Bijou (32 words) by Freddie Mercury. The longest song was 2112 (1062 words) by Neil Peart.
# Box plots of words per song by release year, one panel per writer, with each
# song overlaid as a jittered point. The `text` aesthetic is not drawn by
# ggplot; it supplies the hover tooltip once the plot is passed to ggplotly().
p <- lyrics_2 %>%
  ggplot(
    aes(
      x = released, y = n_words, group = as.factor(released), color = band,
      text = glue(
        "Band: {band} <br>",
        "Album: {album} <br>",
        "Lyrics: {writer} <br>",
        "Song: {song} <br>",
        "Lines: {n_lines} <br>",
        "Words: {n_words}"
      )
    )
  ) +
  geom_boxplot() +
  # height = 0 jitters horizontally only, so plotted word counts stay exact.
  geom_jitter(height = 0, size = 2, alpha = 0.6) +
  scale_color_manual(values = band_palette) +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    x = NULL, y = "Word Count",
    title = glue("Word count IQR is {wc_quant['25%']} to {wc_quant['75%']} words.")
  ) +
  facet_wrap(vars(writer))

ggplotly(p, tooltip = "text")
Save the lyrics with summary stats for subsequent steps.
# Write the song table, including the n_lines and n_words columns, to disk.
saveRDS(object = lyrics_2, file = "./2_lyrics.Rds")