This tutorial demonstrates how to download and analyze higher education data using the College Scorecard API. The College Scorecard provides data on college costs, student outcomes, and institutional characteristics for colleges and universities across the United States.
We’ll focus on University of Oregon data and make comparisons with other institutions.
Before running this code, you need to get a free API key from https://api.data.gov/signup/ (the key is emailed to you after you submit the sign-up form).
Run this code once to install the necessary packages:
# Install any required packages that are not yet available.
# Safe to re-run: packages that are already installed are skipped.
# knitr is included because it is loaded with library(knitr) in this tutorial.
required_pkgs <- c("rscorecard", "tidyverse", "knitr")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
# Load required packages
library(rscorecard)
library(tidyverse)
library(knitr)
# Read the data.gov API key from the environment instead of hard-coding it,
# so the key is never published along with the tutorial source or output.
# Get a free key at https://api.data.gov/signup/ and add this line to your
# ~/.Renviron file (then restart R):
#   DATAGOV_API_KEY=your_key_here
# NOTE(review): a real key was previously embedded in this file; it should be
# treated as exposed and revoked/regenerated.
api_key <- Sys.getenv("DATAGOV_API_KEY")
if (!nzchar(api_key)) {
  stop(
    "No API key found. Set DATAGOV_API_KEY in ~/.Renviron ",
    "(get a free key at https://api.data.gov/signup/).",
    call. = FALSE
  )
}
sc_key(api_key)
# Document-wide knitr defaults: show code, hide messages and warnings,
# and render figures at 10 x 6.
knitr::opts_chunk$set(
  fig.width = 10, fig.height = 6,
  echo = TRUE, message = FALSE, warning = FALSE
)
Let’s start by getting basic information about the University of Oregon.
# Query the latest record for the University of Oregon.
# Filtering on unitid (209551) gives an exact match for the institution.
uo_data <- sc_init() %>%
  sc_filter(unitid == 209551) %>%
  sc_select(unitid, instnm, stabbr, city, locale) %>%
  sc_year("latest") %>%
  sc_get()

# Render the result as a table
uo_data %>%
  kable(caption = "Basic University of Oregon Information")
| unitid | instnm | stabbr | city | locale | year |
|---|---|---|---|---|---|
| 209551 | University of Oregon | OR | Eugene | 12 | latest |
Now let’s examine how enrollment and costs have changed at UO from 2010 to 2020.
# Download UO enrollment and cost data for 2010-2020.
# Each year is requested separately and the yearly results are row-bound
# into a single data frame.
years_to_get <- 2010:2020
uo_trends <- map_dfr(
  years_to_get,
  function(single_year) {
    sc_init() %>%
      sc_filter(unitid == 209551) %>%
      sc_select(instnm, ugds, costt4_a, tuitionfee_in, tuitionfee_out) %>%
      sc_year(single_year) %>%
      sc_get()
  }
)

# Add a numeric copy of the year column for use on continuous plot axes
uo_trends <- mutate(uo_trends, year_numeric = as.numeric(year))

# Show the table without the helper column
uo_trends %>%
  select(-year_numeric) %>%
  kable(
    caption = "University of Oregon Enrollment and Cost Trends (2010-2020)",
    col.names = c(
      "Institution", "Total Undergrads", "Avg Cost",
      "In-State Tuition", "Out-of-State Tuition", "Year"
    )
  )
| Institution | Total Undergrads | Avg Cost | In-State Tuition | Out-of-State Tuition | Year |
|---|---|---|---|---|---|
| University of Oregon | 19219 | 19343 | 8190 | 25830 | 2010 |
| University of Oregon | 20248 | 20343 | 8789 | 27653 | 2011 |
| University of Oregon | 20464 | 21638 | 9310 | 28660 | 2012 |
| University of Oregon | 20473 | 22390 | 9763 | 29788 | 2013 |
| University of Oregon | 20252 | 23055 | 9918 | 30888 | 2014 |
| University of Oregon | 20220 | 23373 | 10289 | 32024 | 2015 |
| University of Oregon | 19775 | 24209 | 10761 | 33441 | 2016 |
| University of Oregon | 19163 | 24570 | 11931 | 34611 | 2017 |
| University of Oregon | 18923 | 25126 | 11898 | 35478 | 2018 |
| University of Oregon | 18743 | 26205 | 12720 | 36615 | 2019 |
| University of Oregon | 17972 | 27717 | 13857 | 39309 | 2020 |
# Line chart of UO undergraduate enrollment by year.
# The x aesthetic uses year_numeric (the numeric copy of `year` in uo_trends)
# so that scale_x_continuous() always receives a numeric variable.
# Line thickness is set with `linewidth`; using `size` for lines is deprecated
# as of ggplot2 3.4.0.
ggplot(uo_trends, aes(x = year_numeric, y = ugds)) +
  geom_line(linewidth = 1.5, color = "#154733") + # UO green
  geom_point(size = 3, color = "#FEE123") + # UO yellow
  labs(
    title = "University of Oregon Undergraduate Enrollment Trends",
    subtitle = "2010-2020",
    x = "Year",
    y = "Total Undergraduate Enrollment"
  ) +
  scale_x_continuous(breaks = 2010:2020) +
  scale_y_continuous(labels = scales::comma) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
# Reshape the tuition columns to long format: one row per year and tuition
# type. year_numeric (the numeric copy of `year` in uo_trends) is carried
# along so the plot below can use a continuous x axis.
uo_tuition_long <- uo_trends %>%
  select(year_numeric, tuitionfee_in, tuitionfee_out) %>%
  pivot_longer(
    cols = c(tuitionfee_in, tuitionfee_out),
    names_to = "tuition_type",
    values_to = "amount"
  ) %>%
  mutate(tuition_type = recode(tuition_type,
    tuitionfee_in = "In-State",
    tuitionfee_out = "Out-of-State"
  ))

# Plot in-state vs out-of-state tuition over time.
# Line thickness is set with `linewidth`; using `size` for lines is deprecated
# as of ggplot2 3.4.0.
ggplot(
  uo_tuition_long,
  aes(x = year_numeric, y = amount, color = tuition_type)
) +
  geom_line(linewidth = 1.5) +
  geom_point(size = 3) +
  labs(
    title = "University of Oregon Tuition Trends",
    subtitle = "In-State vs Out-of-State Tuition & Fees (2010-2020)",
    x = "Year",
    y = "Tuition & Fees ($)",
    color = "Student Type"
  ) +
  scale_color_manual(values = c(
    "In-State" = "#154733",
    "Out-of-State" = "#FEE123"
  )) +
  scale_x_continuous(breaks = 2010:2020) +
  scale_y_continuous(labels = scales::dollar) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom"
  )
Let’s compare UO to other major Oregon universities.
# Pull the latest figures for UO, Oregon State, and Portland State,
# identifying each institution by its unitid.
oregon_comparison <- sc_init() %>%
  sc_filter(unitid == c(209551, 209542, 209807)) %>% # UO, OSU, PSU
  sc_select(
    instnm, ugds, adm_rate, sat_avg,
    tuitionfee_in, pctfloan
  ) %>%
  sc_year("latest") %>%
  sc_get()

# Render the comparison table
oregon_comparison %>%
  kable(
    caption = "Comparison of Major Oregon Universities",
    col.names = c(
      "Institution", "Enrollment", "Admission Rate",
      "Avg SAT", "In-State Tuition", "% Federal Loans", "Year"
    ),
    digits = 2
  )
| Institution | Enrollment | Admission Rate | Avg SAT | In-State Tuition | % Federal Loans | Year |
|---|---|---|---|---|---|---|
| Oregon State University | 29557 | 0.79 | NA | 13494 | 0.33 | latest |
| University of Oregon | 19758 | 0.85 | 1257 | 15669 | 0.31 | latest |
| Portland State University | 13619 | 0.91 | NA | 11238 | 0.33 | latest |
# Bar chart of undergraduate enrollment, largest institution first,
# with each bar labelled by its enrollment count.
ggplot(
  oregon_comparison,
  aes(x = reorder(instnm, -ugds), y = ugds, fill = instnm)
) +
  geom_col() +
  geom_text(
    aes(label = scales::comma(ugds)),
    size = 4, vjust = -0.5
  ) +
  labs(
    title = "Undergraduate Enrollment at Major Oregon Universities",
    x = NULL,
    y = "Total Undergraduate Enrollment"
  ) +
  # Extra headroom above the bars so the text labels are not clipped
  scale_y_continuous(
    labels = scales::comma,
    expand = expansion(mult = c(0, 0.1))
  ) +
  scale_fill_manual(values = c(
    "University of Oregon" = "#154733",
    "Oregon State University" = "#DC4405",
    "Portland State University" = "#00573F"
  )) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    axis.text.x = element_text(angle = 15, hjust = 1),
    legend.position = "none"
  )
Let’s see how UO compares to other Pac-12 universities.
# unitids of the Pac-12 institutions included in this comparison.
# NOTE(review): ten institutions are listed here; confirm whether any
# conference members (e.g. Oregon State, already queried elsewhere in this
# tutorial) were omitted intentionally.
pac12_unitids <- c(
  209551, # University of Oregon
  236948, # University of Washington-Seattle
  110662, # UCLA
  110635, # UC Berkeley
  123961, # USC
  243744, # Stanford
  126614, # Colorado Boulder
  104179, # Arizona
  104151, # Arizona State-Tempe
  230764  # Utah
)

# Latest admissions, earnings, and debt figures for those institutions
pac12 <- sc_init() %>%
  sc_filter(unitid %in% pac12_unitids) %>%
  sc_select(
    instnm, ugds, adm_rate, sat_avg,
    md_earn_wne_p10, grad_debt_mdn
  ) %>%
  sc_year("latest") %>%
  sc_get()

# Render the comparison table
pac12 %>%
  kable(
    caption = "Pac-12 Universities Comparison",
    col.names = c(
      "Institution", "Enrollment", "Admission Rate",
      "Avg SAT", "Median Earnings (10yr)",
      "Median Debt", "Year"
    ),
    digits = 2
  )
| Institution | Enrollment | Admission Rate | Avg SAT | Median Earnings (10yr) | Median Debt | Year |
|---|---|---|---|---|---|---|
| Arizona State University Campus Immersion | 64398 | 0.90 | NA | 62668 | 19500 | latest |
| University of Arizona | 40769 | 0.86 | 1261 | 59979 | 19620 | latest |
| University of California-Berkeley | 33073 | 0.12 | NA | 92446 | 13000 | latest |
| University of California-Los Angeles | 33040 | 0.09 | NA | 82511 | 14000 | latest |
| University of Southern California | 20817 | 0.10 | 1501 | 92498 | 18000 | latest |
| University of Colorado Boulder | 31578 | 0.83 | 1353 | 69738 | 19500 | latest |
| University of Oregon | 19758 | 0.85 | 1257 | 61324 | 20139 | latest |
| University of Utah | 26041 | 0.87 | 1231 | 67170 | 19000 | latest |
| University of Washington-Seattle Campus | 31588 | 0.43 | NA | 78466 | 14615 | latest |
| Stanford University | 7841 | 0.04 | 1553 | 124080 | 12000 | latest |
# Horizontal bar chart of admission rates, with UO highlighted in green.
pac12 %>%
  filter(!is.na(adm_rate)) %>%
  # Shorten institution names for the axis labels. str_remove_all() strips
  # every occurrence of the patterns (str_remove() only drops the first, which
  # left "Campus" on names that also start with "University of"), and
  # str_squish() tidies any leftover leading, trailing, or doubled spaces.
  mutate(
    instnm_short = str_squish(
      str_remove_all(instnm, "University of |University-|Campus")
    )
  ) %>%
  ggplot(aes(x = reorder(instnm_short, adm_rate), y = adm_rate)) +
  geom_col(aes(fill = instnm == "University of Oregon")) +
  geom_text(aes(label = scales::percent(adm_rate, accuracy = 1)),
    hjust = -0.2, size = 3.5
  ) +
  coord_flip() +
  labs(
    title = "Admission Rates at Pac-12 Universities",
    x = NULL,
    y = "Admission Rate"
  ) +
  # Extra room past the end of the bars so the percentage labels fit
  scale_y_continuous(
    labels = scales::percent,
    expand = expansion(mult = c(0, 0.15))
  ) +
  scale_fill_manual(values = c("TRUE" = "#154733", "FALSE" = "gray70")) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 14)
  )
Let’s examine the demographic composition of UO’s student body.
# Latest UO enrollment shares by race/ethnicity, plus Pell and
# first-generation shares.
uo_demographics <- sc_init() %>%
  sc_filter(unitid == 209551) %>%
  sc_select(
    instnm, ugds, ugds_white, ugds_black,
    ugds_hisp, ugds_asian, ugds_aian, ugds_nhpi,
    pctpell, par_ed_pct_1stgen
  ) %>%
  sc_year("latest") %>%
  sc_get()

# Render the demographics table
uo_demographics %>%
  kable(
    caption = "University of Oregon Student Demographics",
    col.names = c(
      "Institution", "Total Enrollment", "% White",
      "% Black", "% Hispanic", "% Asian", "% Am Indian/AK Native",
      "% Native HI/Pac Isl", "% Pell Grant",
      "% First Gen", "Year"
    ),
    digits = 3
  )
| Institution | Total Enrollment | % White | % Black | % Hispanic | % Asian | % Am Indian/AK Native | % Native HI/Pac Isl | % Pell Grant | % First Gen | Year |
|---|---|---|---|---|---|---|---|---|---|---|
| University of Oregon | 19758 | 0.618 | 0.027 | 0.158 | 0.068 | 0.005 | 0.004 | 0.22 | 0.284 | latest |
# Reshape the race/ethnicity share columns to long format for plotting:
# one row per category, dropping missing and zero shares.
uo_race_data <- uo_demographics %>%
  select(starts_with("ugds_")) %>%
  pivot_longer(everything(),
    names_to = "race_category",
    values_to = "proportion"
  ) %>%
  filter(!is.na(proportion), proportion > 0) %>%
  mutate(race_category = recode(race_category,
    ugds_white = "White",
    ugds_black = "Black",
    ugds_hisp = "Hispanic",
    ugds_asian = "Asian",
    ugds_aian = "Am Indian/AK Native",
    ugds_nhpi = "Native HI/Pac Isl"
  ))

# The selected categories do not cover the whole student body, so their
# shares sum to less than 1. A pie chart always fills the full circle, which
# would overstate every slice; add the remainder as its own slice so each
# category is drawn at its true share.
other_share <- 1 - sum(uo_race_data$proportion)
if (nrow(uo_race_data) > 0 && other_share > 0.001) {
  uo_race_data <- uo_race_data %>%
    add_row(race_category = "Other/Unreported", proportion = other_share)
}
# Pie chart: a single stacked bar wrapped into polar coordinates
uo_race_data %>%
  ggplot(aes(x = "", y = proportion, fill = race_category)) +
  geom_col(color = "white", width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "University of Oregon Student Body by Race/Ethnicity",
    fill = "Race/Ethnicity"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_void() +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)
  )
Here’s a more comprehensive dataset with multiple outcome measures.
# One wide record for UO covering institution info, enrollment, admissions,
# costs, aid, and outcomes.
uo_complete <- sc_init() %>%
  sc_filter(unitid == 209551) %>%
  sc_select(
    instnm, city, stabbr, locale,                  # basic info
    ugds, ugds_women,                              # enrollment
    adm_rate, sat_avg, actcmmid,                   # admissions
    costt4_a, tuitionfee_in, tuitionfee_out,       # costs
    pctpell, pctfloan,                             # aid
    c150_4, ret_ft4, grad_debt_mdn, md_earn_wne_p10 # outcomes
  ) %>%
  sc_year("latest") %>%
  sc_get()

# Render the full record as a table
uo_complete %>%
  kable(
    caption = "Comprehensive University of Oregon Data",
    digits = 2
  )
| instnm | city | stabbr | locale | ugds | ugds_women | adm_rate | sat_avg | actcmmid | costt4_a | tuitionfee_in | tuitionfee_out | pctpell | pctfloan | c150_4 | ret_ft4 | grad_debt_mdn | md_earn_wne_p10 | year |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| University of Oregon | Eugene | OR | 12 | 19758 | 0.56 | 0.85 | 1257 | NA | 32636 | 15669 | 43302 | 0.22 | 0.31 | 0.71 | 0.85 | 20139 | 61324 | latest |
Save your data for further analysis or sharing.
# Make sure the output directory exists before writing
if (!dir.exists("data")) {
  dir.create("data")
}

# Write each dataset to its CSV file (list names are the output paths)
iwalk(
  list(
    "data/uo_trends.csv" = uo_trends,
    "data/oregon_universities.csv" = oregon_comparison,
    "data/pac12_comparison.csv" = pac12,
    "data/uo_demographics.csv" = uo_demographics,
    "data/uo_comprehensive.csv" = uo_complete
  ),
  function(dataset, csv_path) {
    write.csv(dataset, csv_path, row.names = FALSE)
  }
)
cat("Data files saved to the 'data' directory!\n")
# Print the R version, platform, and attached/loaded package versions
# so the results in this tutorial can be reproduced.
sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.49 lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
## [5] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [9] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 rscorecard_0.32.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_1.9.0 compiler_4.4.1 tidyselect_1.2.1
## [5] jquerylib_0.1.4 scales_1.3.0 yaml_2.3.10 fastmap_1.2.0
## [9] R6_2.6.1 labeling_0.4.3 generics_0.1.3 curl_5.2.1
## [13] munsell_0.5.1 RColorBrewer_1.1-3 bslib_0.9.0 pillar_1.10.1
## [17] tzdb_0.4.0 rlang_1.1.4 cachem_1.1.0 stringi_1.8.4
## [21] xfun_0.49 sass_0.4.9 lazyeval_0.2.2 timechange_0.3.0
## [25] cli_3.6.3 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
## [29] grid_4.4.1 rstudioapi_0.16.0 hms_1.1.3 lifecycle_1.0.4
## [33] vctrs_0.6.5 evaluate_1.0.3 glue_1.8.0 farver_2.1.2
## [37] colorspace_2.1-1 rmarkdown_2.29 httr_1.4.7 tools_4.4.1
## [41] pkgconfig_2.0.3 htmltools_0.5.8.1