-
Notifications
You must be signed in to change notification settings - Fork 1
/
Kfoury_CancerCell_2021_get_data.Rmd
147 lines (116 loc) · 4.09 KB
/
Kfoury_CancerCell_2021_get_data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
---
title: "Human prostate cancer bone metastases have an actionable immunosuppressive microenvironment (GSE143791)"
subtitle: '[Kfoury, et al. Cancer Cell (2021)](https://doi.org/10.1016/j.ccell.2021.09.005)'
author: "Developed by [Gabriel Hoffman](http://gabrielhoffman.github.io/)"
date: "Run on `r Sys.time()`"
documentclass: article
output:
html_document:
toc: true
smart: false
vignette: >
%\VignetteIndexEntry{Human prostate cancer bone metastases have an actionable immunosuppressive microenvironment}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
%\usepackage[utf8]{inputenc}
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(
echo = TRUE,
warning=FALSE,
message=FALSE,
error = FALSE,
tidy = FALSE,
dev = c("png", "pdf"),
cache = TRUE,
cache.lazy = FALSE)
```
# Download data
```{r download}
path = "/sc/arion/projects/CommonMind/hoffman/scRNAseq_data/Kfoury_CancerCell_2021/raw/"
dir.create(path)
setwd(path)
system("wget --no-check-certificate https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143791/suppl/GSE143791_RAW.tar")
system("wget --no-check-certificate https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143791/suppl/GSE143791_cell.annotation.csv.gz")
system("wget --no-check-certificate https://ftp.ncbi.nlm.nih.gov/geo/series/GSE143nnn/GSE143791/suppl/GSE143791_cell.annotation.human.csv.gz")
system("tar xvf GSE143791_RAW.tar")
system("wget http://pklab.med.harvard.edu/bonemet/data/all.embedding.csv")
```
# Load packages
```{r load.packages}
library(data.table)
library(SingleCellExperiment)
library(GEOquery)
library(gtools)
library(zellkonverter)
library(stringr)
library(Matrix)
```
# Load data
```{r load.data}
# get metadata for these samples from GEO
info = getGEO("GSE143791")$`GSE143791-GPL20301_series_matrix.txt.gz`
ids = sampleNames(phenoData(info))
# get metadata
info = lapply(ids, function(id){
res = getGEO(id)
values = str_split(res@header$characteristics_ch1, pattern=": ")
data = sapply(values, function(x) x[2])
names(data) = sapply(values, function(x) x[1])
data.frame(geoid = id, t(data))
})
info = do.call(smartbind, info)
info$age = as.numeric(info$age)
info$psa.prior.to.surgery = as.numeric(info$psa.prior.to.surgery)
# create sample names
info$ID = paste0(info$subject.id, "-Benign")
info$Status = rep('Benign', nrow(info))
types = c("Tumor", "Involved", "Distal" )
for( type in types){
i = grep(type, info$tissue.type)
info$ID[i] = paste0(info$subject.id[i], "-", type)
info$Status[i] = type
}
# get file names
files = dir(path, pattern="*.count.csv.gz", full.names=TRUE)
# only keep files in this datasets
# get sample names from this dataset
pattern = paste(info$geoid, collapse="|")
files = files[grep(pattern, files)]
# read counts from files
res = lapply(files, function(file){
message(file)
df = fread(file, showProgress=FALSE)
df = as.data.frame(df[!duplicated(df$V1),])
rownames(df) = df$V1
df = as.matrix(df[,-1])
as(df, "sparseMatrix")
})
geneCounts = do.call(cbind, res)
# get cell annotation file
file = paste0(path, "/GSE143791_cell.annotation.human.csv.gz")
cellAnnotation = as.data.frame(fread(file))
# merge annotations with sample metadata
cellAnnotation$ID = sapply(str_split(cellAnnotation$barcode, "_"), function(x) x[1] )
metadata = merge(cellAnnotation, info, by="ID")
rownames(metadata) = metadata$barcode
# get only shared cell barcodes
keep = intersect(colnames(geneCounts), metadata$barcode)
geneCounts = geneCounts[,keep]
metadata = metadata[keep,]
# same order for both data
idx = match(colnames(geneCounts), rownames(metadata))
# create single SingleCellExperiment
sce = SingleCellExperiment(assays = list(counts = geneCounts),
colData = DataFrame(metadata[idx,]))
# Coordinates
file = paste0(path, "/all.embedding.csv")
df_embeding = fread(file)
colnames(df_embeding) = c("barcode", "Coord1", "Coord2")
idx = match(df_embeding$barcode, colnames(sce))
colData(sce)$Coord1[idx] = df_embeding$Coord1
colData(sce)$Coord2[idx] = df_embeding$Coord2
# save to H5AD file
file = paste0(path,"../Kfoury_CancerCell_2021.h5ad")
writeH5AD(sce, file, compression="lzf")
```