The build_count_matrix() function builds a
fragment-overlap count matrix from paired-end BAM files over
user-specified genomic regions.
| Parameter | Type | Default | Description |
|---|---|---|---|
bam_path
|
character vector | — | Vector of BAM file paths |
regions
|
integer or character | — |
Genomic regions to evaluate, either:
|
out_dir
|
character |
“./”
|
Output directory |
ref_genome
|
character |
“hg38”
|
Reference genome |
sample_name
|
character |
NULL
|
Optional prefix added to the output filename. If provided, the final Feather file will include this value at the beginning of the filename. |
force_chr_coord
|
logical |
FALSE
|
By default, if the provided regions file contains a gene identifier
column (for example, gene_id in gff/tsv files or the 4th
“name” column in BED files), this column will be used as the row name
instead of the “CHR_start_end” coordinate string. Setting force_chr_coord = TRUE overrides this behavior and forces
the use of genomic coordinates as row names.
|
The generated count matrix is saved as a Feather file following these conventions:
Fixed-size bins (numeric
regions):
Count_Matrix_<BINSIZE>.feather (e.g.,
Count_Matrix_5000.feather)
Custom regions file (character
regions):
Count_Matrix_<prefix>.feather, where
<prefix> is the basename of the region file (without
extension). (e.g., "peaks.bed" →
Count_Matrix_peaks.feather)
When sample_name is provided:
<sample_name>_<Count_Matrix_xxx>.feather (e.g.,
C1_Count_Matrix_peaks.feather)
library(arrow)
df <- read_feather("output/Count_Matrix_800.feather")
df <- as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE)
head(df)
| pos | H3K27ac-H3K4me3 | H3K4me3-H3K4me3 | H3K4me1-H3K4me3 | H3K4me3-H3K9me3 | |
|---|---|---|---|---|---|
| 1 | chr1_1_800 | 0 | 0 | 0 | 0 |
| 2 | chr1_801_1600 | 0 | 0 | 0 | 0 |
| 3 | chr1_1601_2400 | 0 | 0 | 0 | 0 |
| 4 | chr1_2401_3200 | 0 | 0 | 0 | 0 |
| 5 | chr1_3201_4000 | 0 | 0 | 0 | 0 |
| 6 | chr1_4001_4800 | 0 | 0 | 0 | 0 |
library(arrow)
df <- read_feather("output/Count_Matrix_800.feather")
df <- as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE)
rownames(df) <- df$pos
df$pos <- NULL
df_nonzero <- df[ rowSums(df != 0) > 0, ]
head(df_nonzero)
| H3K4me3-H3K4me3 | H3K27ac-H3K27me3 | H3K9me3-H3K9me3 | H3K9me2-H3K9me3 | |
|---|---|---|---|---|
| chr1_9601_10400 | 1.4928058 | 1.8988764 | 22.8962419 | 1.8861111 |
| chr1_10401_11200 | 0.5071942 | 0.1011236 | 1.1037581 | 0.1138889 |
| chr1_12001_12800 | 0 | 0 | 0.1297468 | 0 |
| chr1_12801_13600 | 0 | 0 | 0.8702532 | 1 |
library(multiEpiCore)
# Basic Usage with Fixed Bins
build_count_matrix(
bam_path = bam_files,
regions = 800,
out_dir = "count_matrix/"
)
# Use Custom Peak Regions
build_count_matrix(
bam_path = bam_files,
regions = "peaks/merged_peaks.bed",
out_dir = "count_matrix/"
)
# Test Data
bam_dirs <- c(
C1 = "bam/C1",
C2 = "bam/C2",
T1 = "bam/T1",
T2 = "bam/T2"
)
out_dir <- "./count_matrix"
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
for (sample_name in names(bam_dirs)) {
bam_dir <- bam_dirs[[sample_name]]
bam_paths <- list.files(
path = bam_dir,
pattern = "\\.bam$",
full.names = TRUE
)
build_count_matrix(
bam_path = bam_paths,
regions = 800,
save_dir = out_dir,
sample_name = sample_name
)
}