---
title: "Gene Expression Code"
author: "Hailey Johnson & Dr. Bonnie LaFleur"
output: html_document
editor_options: 
  chunk_output_type: inline
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
This code shows how the LOD was incorporate.  Additionally a small number to all gene expression so that the analysis can be done on the log scale.  This code was written for all data, but the only data that was
output for the manuscript were the 7 genes listed in the analysis file.  


```{r, warning=FALSE, message=FALSE}
require(readxl)
require(dplyr)
raw.data1 <- read_excel("2019JAN10_Quantigene 52 Human Plex Assay FFPE_plate1 results_plus_analysis_subset.xlsx",
                        sheet = "QG Analysis", n_max=98) %>%
 mutate(Type=paste0(`1. RAW DATA`, "_1")) %>%
 dplyr::rename(Well = `...2`, Description=`...3`) %>%
  slice(-1)

raw.data2 <- read_excel("2019JAN17_Quantigene 52 Human Plex Assay FFPE_plate2 results_plus_analysis_subset.xlsx",
                        sheet = "QG Analysis", n_max=42) %>%
  mutate(Type=paste0(`1. RAW DATA`, "_2")) %>%
  dplyr::rename(Well = `...2`, Description=`...3`) %>%
  slice(-1)

## merge both without controls and ignoring the plate:
raw.data <- rbind(raw.data1, raw.data2) %>%
  filter(grepl("X",Type)) %>%
  select(Type, everything(),-`1. RAW DATA`) 

raw.data.thymus.samples <- raw.data %>%
  filter(grepl("T|P|LH", Description))
raw.data.thymus.samples$Description <- gsub("T|)|\\(|LH","",raw.data.thymus.samples$Description)
raw.data.thymus.samples$Description <- gsub(".*\\-","",raw.data.thymus.samples$Description)
raw.data.thymus.samples$Description <- gsub(".+? ", "",raw.data.thymus.samples$Description)
raw.data.thymus.samples <- raw.data.thymus.samples %>%
  mutate(Description=ifelse(Description=="9266", "926",
                            ifelse(Description=="22b","222",Description)))
raw.data.thymus.samples[, 4:10] <- raw.data.thymus.samples[, 4:10] %>%
  mutate_if(is.character, as.numeric)

names(raw.data.thymus.samples)[3] <- "DHVI Thymus Tissue ID Number (T#)"

```

### I also read in a patient characteristics file. This file not only had the ID's of each expression value, but thymic characteristics affiliated with each ID we could study.

```{r, warning=FALSE, message= FALSE}
#setwd("/cloud/project/Quantigene Study")

library(readr)
Patient_characteristics <- read_csv("PatientCharacteristics.csv")
```

#### The gene expression data came in two sets (due to the assay not being able to run all samples at once; there were too many). The first set had it's own specified limit of detections, and the second one had it's own as well.

Data set 1's and 2's limit of detections shown here:

```{r, warning=FALSE, message=FALSE, echo=FALSE}
#setwd("/cloud/project/Quantigene Study")

library(readr)
Limit_of_detection1 <- read_csv("LimitOfDetection1_subset.csv")
Limit_of_detection2 <- read_csv("LimitOfDetection2_subset.csv")

```


```{r,warning=FALSE, message= FALSE, echo=FALSE}
library('knitr')
kable(Limit_of_detection1[1, 5:11], format = "markdown")
```

```{r, warning=FALSE, message=FALSE, echo=FALSE}
library('knitr')
kable(Limit_of_detection2[1, 5:11], format = "markdown")
```

### To accommodate for these separate detection limits, I had to separate the expression values into two dataframes. This allowed me to manipulate the expressions based on their unique LOD. 

```{r, warning=FALSE, message= FALSE}
raw.data.thymus.samples1 <- raw.data.thymus.samples[c(1:90), ]
raw.data.thymus.samples2 <- raw.data.thymus.samples[c(91:124), ]

raw.data.thymus.samples1 <- raw.data.thymus.samples1[ ,-c(1:2)]
raw.data.thymus.samples2 <- raw.data.thymus.samples2[ ,-c(1:2)]
```

### Then the hard work began. I first took the first set of samples. Any value that was below their gene's unique Limit of detection was given a value that was 1/2(LOD). This is a statistical way to work with those uncertain values, and a more in depth reasoning for this method can be found here: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3122101/

#### Data set 1
```{r, message=FALSE}
i <- c(2:8) # Check which columns are the genes!
raw.data.thymus.samples1[ , i] <- apply(raw.data.thymus.samples1[ , i], 2,
                                function(x) as.numeric(as.character(x)))

raw.data.thymus.samples1$"KRT14 (15)"  <- ifelse(raw.data.thymus.samples1$ "KRT14 (15)" <72, 36, raw.data.thymus.samples1$ "KRT14 (15)") 
raw.data.thymus.samples1$"KRT8 (30)" <- ifelse(raw.data.thymus.samples1$"KRT8 (30)"<87, 43.5, raw.data.thymus.samples1$"KRT8 (30)")
raw.data.thymus.samples1$"CD1a (34)" <- ifelse(raw.data.thymus.samples1$"CD1a (34)"<79, 39.5, raw.data.thymus.samples1$"CD1a (34)")
raw.data.thymus.samples1$"GAPDH (35)" <- ifelse(raw.data.thymus.samples1$"GAPDH (35)"<71, 35.5, raw.data.thymus.samples1$"GAPDH (35)")
raw.data.thymus.samples1$"CCL 21 (48)" <- ifelse(raw.data.thymus.samples1$"CCL 21 (48)"<77, 38.5, raw.data.thymus.samples1$"CCL 21 (48)")
raw.data.thymus.samples1$"CD3 E (57)" <- ifelse(raw.data.thymus.samples1$"CD3 E (57)"<100, 50, raw.data.thymus.samples1$"CD3 E (57)")
raw.data.thymus.samples1$"CXCL 12 (67)" <- ifelse(raw.data.thymus.samples1$"CXCL 12 (67)"<145, 72.5, raw.data.thymus.samples1$"CXCL 12 (67)")

```


#### Data set 2
```{r, message= FALSE}
i <- c(2:8) # Check which columns are the genes!
raw.data.thymus.samples2[ , i] <- apply(raw.data.thymus.samples2[ , i], 2,
                                        function(x) as.numeric(as.character(x)))
  
raw.data.thymus.samples2$"KRT14 (15)"  <- ifelse(raw.data.thymus.samples2$ "KRT14 (15)"<52, 26, raw.data.thymus.samples2$ "KRT14 (15)") 

raw.data.thymus.samples2$"KRT8 (30)" <- ifelse(raw.data.thymus.samples2$"KRT8 (30)"<219, 109.5, raw.data.thymus.samples2$"KRT8 (30)")
raw.data.thymus.samples2$"IL 22 (33)" <- ifelse(raw.data.thymus.samples2$"IL 22 (33)"<239, 119.5, raw.data.thymus.samples2$"IL 22 (33)")
raw.data.thymus.samples2$"CD1a (34)" <- ifelse(raw.data.thymus.samples2$"CD1a (34)"<166, 83, raw.data.thymus.samples2$"CD1a (34)")
raw.data.thymus.samples2$"GAPDH (35)" <- ifelse(raw.data.thymus.samples2$"GAPDH (35)"<67, 33.5, raw.data.thymus.samples2$"GAPDH (35)")
raw.data.thymus.samples2$"CCL 21 (48)" <- ifelse(raw.data.thymus.samples2$"CCL 21 (48)"<47, 23.5, raw.data.thymus.samples2$"CCL 21 (48)")
raw.data.thymus.samples2$"CD3 E (57)" <- ifelse(raw.data.thymus.samples2$"CD3 E (57)"<67, 33.5, raw.data.thymus.samples2$"CD3 E (57)")


```

### Then.... I put them back together.

```{r, message= FALSE}
thymus.samples.data <- rbind(raw.data.thymus.samples1, raw.data.thymus.samples2)

test <-unique(thymus.samples.data$`DHVI Thymus Tissue ID Number (T#)`)

```

#### Now that I had my uniform set with properly substituted values, I had to actually keep genes with >60% Values above the limit of detection. This is because those values would have accurate information to conduct a proper analysis on. 


##### Figuring out which genes had >60% of values above the limit of detection was a very complex, long, and painful process that was not included in this HTML. But in order to do this, I assigned values which were below the limit of detection with a "0", and values above the limit of detection with a "1". By taking the mean of each gene with these new assigned variables, I was really finding the percentage of values above the LOD! (Since the mean is simply adding up all the 1's, 0's dont count, then divinding by the total amount of values). I could then easily see and write down the genes which needed to be taken out. Out of the 52 genes, 18 had >60% of values above the LOD. This was the code removing those genes: 
```{r, message=FALSE}
aggregated.thymus.samples  <- thymus.samples.data

## Aggregating ID so there are no duplicates                
```{r, warning=FALSE, message=FALSE}

aggregated.thymus.samples <- aggregate(aggregated.thymus.samples[ ,c(2:8)],
                          by=list(aggregated.thymus.samples$"DHVI Thymus Tissue ID Number (T#)"), 
                                  FUN = mean)

aggregated.thymus.samples <-unique(aggregated.thymus.samples$`DHVI Thymus Tissue ID Number (T#)`)
```

## Adding patient characteristics
```{r, warning=FALSE, message=FALSE}
names(aggregated.thymus.samples)[1] <- "DHVI Thymus Tissue ID Number (T#)"
aggregated.thymus.samples <- merge(aggregated.thymus.samples, Patient_characteristics, by="DHVI Thymus Tissue ID Number (T#)")
```

## Taking the log(expression)! This was a very important step as it gives us a reasonable range and helps us see accurate results.

```{r,warning=FALSE, message=FALSE}
aggregated.thymus.samples[, 2:8] <- log10(aggregated.thymus.samples[2:8] + 0.0001) 

```

```{r, warning=FALSE, include=FALSE, message=FALSE}

names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "KRT14 (15)"] <- "KRT14"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "KRT8 (30)"] <- "KRT8"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "CD1a (34)"] <- "CD1a"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "CCL 21 (48)"] <- "CCL.21"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "CD3 E (57)"] <- "CD3.E"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "CXCL 12 (67)"] <- "CXCL.12"
names(aggregated.thymus.samples)[names(aggregated.thymus.samples) == "GAPDH (35)"] <- "GAPDH"



names(aggregated.thymus.samples)

new.expression aggregated.thymus.samples
names(new.expression)[names(new.expression) == "DHVI Thymus Tissue ID Number (T#)"] <- "id"

names(new.expression)
dim(new.expression)

write.csv(new.expression, "new.expression.csv")

```