Snippet #2: Cleaning column names of an imported csv

R
tutorials
Author

Giorgio Luciano

Published

February 19, 2023

  1. Import data from a csv file
  2. Use the function clean_names from the janitor package (Firke 2023)
  3. Write a function in base R using gsub and regex to tackle specific issues
  4. You’re done

First of all we import the csv using the here library (Müller 2020)

library(here)
here() starts at I:/giorgioluciano.github.io/Blog
# Locate the csv relative to the project root reported by here().
file_in <- "FakeData.csv"
path_in <- "posts/013_Clean_csv/"
# check.names = FALSE keeps the header exactly as written in the file, so the
# raw names can later be compared with the cleaned ones.
data <- read.csv(
  here(path_in, file_in),
  header = TRUE,
  check.names = FALSE,
  encoding = "latin1"
)
library(janitor)

Attaching package: 'janitor'
The following objects are masked from 'package:stats':

    chisq.test, fisher.test
# Pass the imported data through janitor's clean_names(); the result is kept
# separately so its column names can be compared with the originals.
data_fixed <- clean_names(data)

And now the function written by William Doane

#' Restore clinical abbreviations in (cleaned) column names
#'
#' Applies a fixed, ordered list of substitutions to a set of names, e.g.
#' `hdl` -> `HDL`, `f_t4` -> `fT4`, `glic` -> `glicemia`.
#'
#' Two kinds of rule are applied:
#' * "anywhere" rules replace the abbreviation wherever it occurs,
#'   ignoring case (e.g. `delta_hdl` -> `delta_HDL`);
#' * "token" rules replace the abbreviation only when it starts at the
#'   beginning of the name or right after a non-alphanumeric character
#'   (so `_` counts as a separator) and ends at a word boundary
#'   (e.g. `delta_tg` -> `delta_TG`, but `CK` is left alone).
#'
#' @param .data A data frame (its column names are rewritten) or a character
#'   vector of names.
#' @param unique If `TRUE`, the resulting names are de-duplicated with
#'   `make.unique(sep = "_")`.
#' @return The data frame with renamed columns when `.data` is a data frame,
#'   otherwise the rewritten character vector.
clinical_names <- function(.data, unique = FALSE) {
  n <- if (is.data.frame(.data)) colnames(.data) else .data

  # Abbreviations replaced wherever they appear (case-insensitive).
  anywhere <- c(
    cvrisk = "CVrisk",
    hbo    = "HBO",
    ft4    = "fT4",
    f_t4   = "fT4",
    ft3    = "fT3",
    f_t3   = "fT3",
    ldl    = "LDL",
    hdl    = "HDL",
    hba1c  = "HbA1C",
    hbac1  = "HbA1C",
    hb_ac1 = "HbA1C",
    tsh    = "TSH",
    acth   = "ACTH"
  )
  for (abbr in names(anywhere)) {
    n <- gsub(abbr, anywhere[[abbr]], n, ignore.case = TRUE)
  }

  # Abbreviations replaced only as a standalone token. The element symbols
  # (Na, K, P) are case-sensitive; the remaining tokens are not.
  token       <- c("igf", "Na", "K", "P", "pas", "pad", "pth", "clu", "tg",
                   "glic")
  replacement <- c("IGF", "Sodio", "Potassio", "Fosforo", "PAS", "PAD", "PTH",
                   "CLU", "TG", "glicemia")
  ignore_case <- c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE,
                   TRUE)
  for (i in seq_along(token)) {
    # The previous patterns started with a stray escape ("\\tg\\b",
    # "\\K\\b", ...): "\\t" is a TAB, so "tg" never matched, and the other
    # escapes degraded to a bare letter with no left boundary. An explicit
    # look-behind is used instead of "\\b" on the left so that "_" still
    # separates tokens (delta_pas -> delta_PAS); look-behind needs perl = TRUE.
    pattern <- paste0("(?<![[:alnum:]])", token[i], "\\b")
    n <- gsub(pattern, replacement[i], n,
              ignore.case = ignore_case[i], perl = TRUE)
  }

  if (unique) n <- make.unique(n, sep = "_")

  if (is.data.frame(.data)) {
    colnames(.data) <- n
    .data
  } else {
    n
  }
}
data_clean <- clinical_names(data_fixed)

# Put the column names from each stage side by side: as imported, after
# clean_names(), and after clinical_names().
comparison <- data.frame(
  original = colnames(data),
  fixed = colnames(data_fixed),
  clean = colnames(data_clean)
)

comparison
           original             fixed             clean
1          paziente          paziente          paziente
2               età               eta               eta
3               SEX               sex               sex
4          diagnosi          diagnosi          diagnosi
5           terapia           terapia           terapia
6             tempo             tempo             tempo
7            Cvrisk            cvrisk            CVrisk
8              peso              peso              peso
9        delta Peso        delta_peso        delta_peso
10              BMI               bmi               bmi
11         deltaBMI         delta_bmi         delta_bmi
12              PAS               pas               PAS
13         deltaPas         delta_pas         delta_PAS
14              pad               pad               PAD
15         deltaPad         delta_pad         delta_PAD
16              HBO               hbo               HBO
17           neutro            neutro            neutro
18            linfo             linfo             linfo
19             glic              glic          glicemia
20    deltaglicemia     deltaglicemia     deltaglicemia
21            HBAC1             hbac1             HbA1C
22       deltaHbAc1      delta_hb_ac1       delta_HbA1C
23            sodio             sodio             sodio
24         potassio          potassio          potassio
25           calcio            calcio            calcio
26          fosforo           fosforo           fosforo
27      colesterolo       colesterolo       colesterolo
28 deltaColesterolo delta_colesterolo delta_colesterolo
29              HDL               hdl               HDL
30         deltaHDL         delta_hdl         delta_HDL
31              ldl               ldl               LDL
32         deltaLDL         delta_ldl         delta_LDL
33               TG                tg                tg
34          deltaTG          delta_tg          delta_tg
35             ACTH              acth              ACTH
36        cortisolo         cortisolo         cortisolo
37              CLU               clu               CLU
38              IGF               igf               IGF
39              TSH               tsh               TSH
40              fT4              f_t4               fT4
41              PTH               pth               PTH
42       Vitamina D        vitamina_d        vitamina_d
43          dose_CA           dose_ca           dose_ca
44          dose_HC           dose_hc           dose_hc
45          dose_PL           dose_pl           dose_pl
46 dose equivalente  dose_equivalente  dose_equivalente

References

Firke, Sam. 2023. “Janitor: Simple Tools for Examining and Cleaning Dirty Data.” https://CRAN.R-project.org/package=janitor.
Müller, Kirill. 2020. “Here: A Simpler Way to Find Your Files.” https://CRAN.R-project.org/package=here.