## Effortless Data Handling: Find Variables Across Multiple Data Files with R | by Rodrigo M Carrillo Larco, MD, PhD | Nov, 2024
library(tidyverse)
library(stringr)
## STEPS TO USE THESE FUNCTIONS:
## 1. DEFINE THE OBJECT ‘path_file’ (LOWERCASE, AS USED IN THE CODE), WHICH IS
## A PATH TO THE DIRECTORY WHERE ALL THE DATASETS ARE STORED.
## 2. APPLY THE FUNCTION ‘get_names_labels’ WITH THE PATH. THE FUNCTION WILL
## RETURN A DATAFRAME NAMED ‘names_labels’.
## 3. THE FUNCTION WILL RETURN A DATASET (‘names_labels’) SHOWING THE NAMES OF
## THE VARIABLES, THE LABELS, AND THE DATASET. VISUALLY/MANUALLY EXPLORE THE
## DATASET TO SELECT THE VARIABLES WE NEED. CREATE A VECTOR WITH THE NAMES
## OF THE VARIABLES WE NEED, AND NAME THIS VECTOR ‘variables_needed’.
## 4. FROM THE DATASET ‘names_labels’, KEEP ONLY THE ROWS WITH THE VARIABLES WE
## WILL USE (STORED IN THE VECTOR ‘variables_needed’).
## 5. APPLY THE FUNCTION ‘read_and_select’ TO EACH OF THE DATASETS WITH RELEVANT
## VARIABLES. THIS FUNCTION WILL ONLY NEED THE NAME OF THE DATASET, WHICH IS
## STORED IN THE LAST COLUMN OF DATASET ‘names_labels’.
### FUNCTION TO 1) READ ALL DATASETS IN A FOLDER; 2) EXTRACT NAMES AND LABELS;
### 3) PUT NAMES AND LABELS IN A DATASET; AND 4) RETURN THE DATASET. THE ONLY
### INPUT NEEDED IS A PATH TO A DIRECTORY WHERE ALL THE DATASETS ARE STORED.
#' Catalogue variable names and labels for every SAS dataset in a directory
#'
#' Reads each `*.sas7bdat` file found directly inside `path_file`, collects the
#' column names together with each column's `label` attribute, and stacks the
#' per-file results into a single data frame.
#'
#' @param path_file Path to the directory holding the `.sas7bdat` files. A
#'   trailing path separator is optional.
#' @return Invisibly, a data frame with the columns `variable_name`,
#'   `variable_label` (`NA` when a column carries no label) and `file_name`.
#'   It has zero rows when the directory contains no `.sas7bdat` file. As a
#'   side effect the same data frame is assigned to `names_labels` in the
#'   global environment.
get_names_labels <- function(path_file) {
  # full.names = TRUE lets list.files() build the paths, so `path_file` works
  # with or without a trailing separator.
  sas_paths <- list.files(
    path = path_file,
    pattern = "\\.sas7bdat$",
    full.names = TRUE
  )

  # lapply() over the paths is safe for an empty directory, unlike
  # `1:length(x)`, which would iterate over c(1, 0).
  per_file <- lapply(sas_paths, function(sas_path) {
    sas_file <- basename(sas_path)
    print(sas_file)

    # Read the SAS file; haven is namespace-qualified because
    # library(tidyverse) does not attach it.
    sas_data <- as.data.frame(haven::read_sas(sas_path))

    # Get the variable names and labels. `exact = TRUE` prevents partial
    # matching against a `labels` (value labels) attribute.
    var_names <- names(sas_data)
    labels <- vapply(
      sas_data,
      function(column) {
        column_label <- attr(column, "label", exact = TRUE)
        if (is.null(column_label)) {
          NA_character_
        } else {
          as.character(column_label)[1]
        }
      },
      character(1)
    )

    # Combine the variable names and labels into a data frame. rep() keeps
    # the column lengths consistent even for a file with zero columns.
    data.frame(
      variable_name = var_names,
      variable_label = labels,
      file_name = rep(sas_file, length(var_names)),
      stringsAsFactors = FALSE
    )
  })

  if (length(per_file) == 0L) {
    # do.call(rbind, list()) would yield NULL; return a typed empty table.
    results_df <- data.frame(
      variable_name = character(0),
      variable_label = character(0),
      file_name = character(0),
      stringsAsFactors = FALSE
    )
  } else {
    results_df <- do.call(rbind, per_file)
  }

  # Kept for backward compatibility: read_and_select() reads `names_labels`
  # from the global environment.
  assign("names_labels", results_df, envir = .GlobalEnv)
  invisible(results_df)
}
################################################################################
### FUNCTION TO READ EACH DATASET AND KEEP ONLY THE VARIABLES WE SELECTED; THE
### FUNCTION WILL SAVE EACH DATASET IN THE ENVIRONMENT. THE ONLY INPUT IS THE
### NAME OF THE DATASET.
#' Read one SAS dataset and keep only the catalogued variables
#'
#' Relies on two objects that must already exist in the global environment:
#' `path_file` (directory holding the datasets) and `names_labels` (a data
#' frame with the columns `variable_name` and `file_name`, filtered down to
#' the variables that should be kept).
#'
#' @param df_file File name of the dataset, as stored in
#'   `names_labels$file_name`.
#' @return Invisibly, the selected columns as a data frame. As a side effect
#'   the data frame is assigned in the global environment under the part of
#'   `df_file` that precedes its first dot.
read_and_select <- function(df_file) {
  # Add a separator only when `path_file` does not already end with one, so
  # both "dir" and "dir/" work.
  needs_sep <- nzchar(path_file) && !grepl("[/\\\\]$", path_file)
  full_path <- paste0(path_file, if (needs_sep) "/" else "", df_file)

  # Variables requested for this particular file.
  keep_vars <- unique(
    names_labels$variable_name[which(names_labels$file_name == df_file)]
  )

  # all_of() makes the external character vector explicit and errors clearly
  # if a requested variable is absent from the file. haven is qualified
  # because library(tidyverse) does not attach it.
  df_tmp <- haven::read_sas(full_path) %>%
    select(all_of(keep_vars)) %>%
    as.data.frame()

  # The object name is everything before the first dot in the file name.
  assign(str_extract(df_file, "[^.]+"), df_tmp, envir = .GlobalEnv)
  invisible(df_tmp)
}
################################################################################