################################################################################################## ### PythonImport - importing Python results, combining data, and performing data quality control ################################################################################################## # Code accompanying "Non-targeted tandem mass spectrometry analysis reveals diversity and # variability in aerosol functional groups across multiple sites, seasons, and times of day" # Jenna C. Ditto # September 10, 2019 # Updated November 20, 2019 ################################################################################################## ### Getting started ################################################################################################## # This function looks like: PythonImport <- function(FileName,MZTableFileLocation) # FileName = this should be the identifier for your sample (for example, if your Python results file is called Sample1_Python_FGCount.csv, FileName would be 'Sample1_Python', but you can change this line in the code if you prefer to format your file names some other way) # MZTableFileLocation = the file name associated with where you stored your mztable from the .CEF conversion code # For example: PythonImport('Test1_Pos','Test1_Pos_MSMS') # Important! 
# Important! Set these paths to wherever your files are.
PythonOutputPath <- "path_for_python_output_files" # results generated by the Ruggeri and Takahama Python script
PythonInputPath <- "path_for_python_input_files" # the SiriusImport code produced a CSV containing the Python input file; we need that CSV here (should be all in 1 folder)
FormulaInformationPath <- "path_for_formula_information_files" # the SiriusImport code produced a CSV containing extra formula information; we need that CSV here (should all be in 1 folder)
MZTablePath <- "general_path_for_mztables" # path to the folder containing ALL sample data; each mztable should be in a per-sample sub-folder along with the .MGF and .MS files
FinalizedDataPath <- "path_for_final_data" # where the finalized, QC-filtered CSVs are written
# The MZTableFileLocation input in the function is the name of the sub-folder that the code should open to find the mztable you want.
# For example, if your data was stored at 'Users/Me/Desktop/MSMSProject', you would set MZTablePath to 'Users/Me/Desktop/MSMSProject',
# then the MZTableFileLocation would be each individual sub-folder for each sample (you would have folders inside the 'MSMSProject' folder
# called 'Sample1', 'Sample2', etc. for each sample, so your MZTableFileLocation would be 'Sample1', etc.).
# You can modify these paths however makes most sense with your data storage.
########################################################################################
### PythonImport - imports Python results and compiles all results
########################################################################################

# Import the Python functional-group counts for one sample, merge them with the
# mztable (NeutralMass/Abundance/RT), the Python input (SMILES), and the formula
# information, compute derived quantities (elemental ratios, volatility per
# Li et al. 2016), apply QC flags, and write/return the finalized table.
#
# Arguments:
#   FileName            - sample identifier prefix, e.g. 'Sample1_Python' for a
#                         results file called 'Sample1_PythonFGCount_1107.csv'
#   MZTableFileLocation - name of the sub-folder (under MZTablePath) holding
#                         this sample's mztable.csv
#
# Returns: the merged, QC-filtered data frame. Side effects: writes
# '<FileName>_processedFGlist.csv' to FinalizedDataPath, assigns the table into
# the Global Environment under that CSV name, and changes the working directory.
PythonImport <- function(FileName, MZTableFileLocation) {

  # --- Pull in Python results (generated by the Ruggeri and Takahama script) ---
  setwd(PythonOutputPath) # put all the files in 1 sub-folder somewhere
  CSVFileName <- paste0(FileName, 'FGCount_1107.csv') # change to match your file naming
  pythonresults <- read.csv(CSVFileName, header = TRUE)

  # --- Pull in Python input files (these carry the SMILES strings) ---
  setwd(PythonInputPath)
  PythonFileName <- paste0(FileName, 'ForPython.csv') # change to match your file naming
  pythoninput <- read.csv(PythonFileName, header = TRUE)
  pythoninput <- pythoninput[, -1] # drop the useless first (row-name) column

  # --- Pull in mztable (from the Agilent CEF -> SIRIUS MS conversion;
  #     independent of SIRIUS processing) ---
  # BUG FIX: was paste0(MZTablePath, "/", MZTableFileLocation, sep = "") -
  # 'sep' is not an argument of paste0(), so it was silently pasted into the path.
  setwd(file.path(MZTablePath, MZTableFileLocation))
  mztable2 <- read.csv('mztable.csv', header = TRUE, na.strings = " ") # all labeled the same way
  mztable2 <- na.omit(mztable2)
  mztable2 <- mztable2[order(mztable2$Compound), ]
  mztable2_tocombine <- mztable2[, c(5, 6, 7, 10)] # NeutralMass, Abundance, RT, Compound

  # --- Pull in formula information ---
  setwd(FormulaInformationPath)
  FormulaFileName <- paste0(FileName, 'FormulaInformation.csv') # change to match your file naming
  formula_table <- read.csv(FormulaFileName, header = TRUE)

  # --- Merge relevant tables: Python results + mztable, then + SMILES, then + formulas ---
  merged_tables <- merge(pythonresults, mztable2_tocombine,
                         by.x = "compound", by.y = "Compound")
  merged_tables <- merge(merged_tables, pythoninput,
                         by.x = 'compound', by.y = 'compound')
  merged_tables <- merge(merged_tables, formula_table,
                         by.x = c('compound', 'SMILES'),
                         by.y = c('Compound', 'SMILES'))

  ### From here on out, the code can be customized!
  # You can keep/drop columns (if you only care about a subset of functional
  # groups) and rename columns rather than using the Python script's names.

  # --- Derived quantities (vectorized; division by zero yields Inf/NaN) ---
  merged_tables$RelAbun <- merged_tables$Abundance / sum(merged_tables$Abundance)
  merged_tables$O_to_C <- merged_tables$O / merged_tables$C
  merged_tables$O_to_N <- merged_tables$O / merged_tables$N
  merged_tables$O_to_S <- merged_tables$O / merged_tables$S
  merged_tables$N_to_C <- merged_tables$N / merged_tables$C
  merged_tables$S_to_C <- merged_tables$S / merged_tables$C
  merged_tables$H_to_C <- merged_tables$H / merged_tables$C

  ### Calculate volatility (Li et al., https://www.atmos-chem-phys.net/16/3327/2016/)
  # Parameterization coefficients by compound class. Classes with S but no O
  # (CHS / CHNS) are not parameterized by Li et al., so LogCo stays 0 for those
  # rows, exactly as in the original code.
  li_params <- function(hasO, hasN, hasS) {
    if (!hasO && !hasN && !hasS) {        # CH
      list(nc = 23.8, bc = 0.4861, bo = 0, bco = 0, bn = 0, bs = 0)
    } else if (hasO && !hasN && !hasS) {  # CHO
      list(nc = 22.66, bc = 0.4481, bo = 1.656, bco = -0.779, bn = 0, bs = 0)
    } else if (!hasO && hasN && !hasS) {  # CHN
      list(nc = 24.59, bc = 0.4066, bo = 0, bco = 0, bn = 0.9619, bs = 0)
    } else if (hasO && hasN && !hasS) {   # CHON
      list(nc = 24.13, bc = 0.3667, bo = 0.7732, bco = -0.07790, bn = 1.114, bs = 0)
    } else if (hasO && !hasN && hasS) {   # CHOS
      list(nc = 24.06, bc = 0.3637, bo = 1.327, bco = -0.3988, bn = 0, bs = 0.7579)
    } else if (hasO && hasN && hasS) {    # CHONS
      list(nc = 28.50, bc = 0.3848, bo = 1.011, bco = 0.2921, bn = 1.053, bs = 1.316)
    } else {
      NULL # CHS / CHNS: no parameterization available
    }
  }
  merged_tables$LogCo <- 0
  for (i in seq_len(nrow(merged_tables))) {
    p <- li_params(merged_tables$O[i] != 0,
                   merged_tables$N[i] != 0,
                   merged_tables$S[i] != 0)
    if (!is.null(p)) {
      merged_tables$LogCo[i] <-
        (p$nc - merged_tables$C[i]) * p$bc -
        merged_tables$O[i] * p$bo -
        2 * p$bco * (merged_tables$C[i] * merged_tables$O[i]) /
          (merged_tables$C[i] + merged_tables$O[i]) -
        merged_tables$N[i] * p$bn -
        merged_tables$S[i] * p$bs
    }
  }
  merged_tables$Co <- 10^(merged_tables$LogCo)

  # Volatility classes (saturation concentration Co, ug m-3):
  #   VOC >= 3e6, IVOC [300, 3e6), SVOC [0.3, 300), LVOC [3e-4, 0.3), ELVOC < 3e-4
  merged_tables$Volatility <- as.character(cut(
    merged_tables$Co,
    breaks = c(-Inf, 3e-4, 0.3, 300, 3e6, Inf),
    labels = c('ELVOC', 'LVOC', 'SVOC', 'IVOC', 'VOC'),
    right = FALSE
  ))

  # --- Data QC/QA ---
  # Here you can add a check that the number of O, N, S in the molecular formula
  # equals the number of O, N, S in the functional group count, to ensure you
  # are not missing functional groups. Exactly which columns you sum to
  # calculate the O/N/S in your functional group count depends on which
  # functional groups you have in your table.

  # Formula QC/QA - priority flags (you can add others as you wish; see Ditto
  # et al. for further description: https://www.nature.com/articles/s42004-018-0074-3)
  merged_tables$N_rule_flag <- 0 # nitrogen rule
  merged_tables$H_to_C_flag <- 0 # H/C ratio
  merged_tables$DBE_flag <- 0    # double bond equivalent
  # BUG FIX: was flags$Sum <- seq(from=0,to=0) - 'flags' was never created,
  # which stopped the script with "object 'flags' not found". The intent (per
  # the original comment) was an empty column on merged_tables to sum priority flags.
  merged_tables$Sum <- 0

  # Integer (nominal) mass of CHONS compounds
  # (***add other elements here if you're considering them***)
  merged_tables$Nominal <- merged_tables$C * 12 + merged_tables$H * 1 +
    merged_tables$O * 16 + merged_tables$N * 14 + merged_tables$S * 32
  merged_tables$H_to_C <- merged_tables$H / merged_tables$C
  merged_tables$DBE <- 0.5 * (2 * merged_tables$C + 2 +
                                merged_tables$N - merged_tables$H)

  # Nitrogen rule flag: compounds with an odd number of N must have an odd
  # integer mass (***careful when elements beyond CHONS are included, and for
  # compounds above ~500 g/mol; if you allow more than 3 N you must extend
  # the even/odd N sets below***)
  odd_mass <- merged_tables$Nominal %% 2 != 0
  even_N <- merged_tables$N == 2 | merged_tables$N == 0
  odd_N <- merged_tables$N == 1 | merged_tables$N == 3
  # Flag: odd mass with even N, or even mass with odd N
  merged_tables$N_rule_flag[(odd_mass & even_N) | (!odd_mass & odd_N)] <- 1

  # H/C flag: flags chemically implausible (really low or really high) H/C
  merged_tables$H_to_C_flag[merged_tables$H_to_C < 0.2 |
                              merged_tables$H_to_C > 3.1] <- 1

  # DBE flag: non-integer or negative DBE is impossible
  merged_tables$DBE_flag[ceiling(merged_tables$DBE) != merged_tables$DBE] <- 1
  merged_tables$DBE_flag[merged_tables$DBE < 0] <- 1

  # Nominal vs neutral mass check
  merged_tables$Nominal_Flag <- 0
  merged_tables$Nominal_Flag[merged_tables$Nominal !=
                               round(merged_tables$NeutralMass)] <- 1

  # SMILES vs formula check: count O/N/S atom letters in each SMILES string
  # (uppercase only, as in the original - aromatic lowercase o/n/s would be
  # missed; gregexpr/regmatches are vectorized over the SMILES column)
  count_atom <- function(smiles, letter) {
    lengths(regmatches(smiles, gregexpr(letter, smiles)))
  }
  merged_tables$SMILES_Formula_Check_O <- count_atom(merged_tables$SMILES, "O")
  merged_tables$SMILES_Formula_Check_N <- count_atom(merged_tables$SMILES, "N")
  merged_tables$SMILES_Formula_Check_S <- count_atom(merged_tables$SMILES, "S")
  merged_tables$SMILES_Formula_Check_Flag <- as.integer(
    merged_tables$SMILES_Formula_Check_O != merged_tables$O |
      merged_tables$SMILES_Formula_Check_N != merged_tables$N |
      merged_tables$SMILES_Formula_Check_S != merged_tables$S
  )

  # Drop compounds that fail the nitrogen rule or the H/C check
  merged_tables <- merged_tables[merged_tables$N_rule_flag == 0 &
                                   merged_tables$H_to_C_flag == 0, ]

  # --- Write the finalized CSV and expose the result ---
  setwd(FinalizedDataPath)
  sample_csv_name <- paste0(FileName, '_processedFGlist.csv') # CSV label
  write.csv(merged_tables, sample_csv_name)
  # Keep this line if you want to access results in the Global Environment
  assign(sample_csv_name, merged_tables, .GlobalEnv)
  return(merged_tables) # you can have the function return whichever table you want
}