1. Build Count Matrix

The build_count_matrix() function builds a fragment-overlap count matrix from paired-end BAM files over user-specified genomic regions.

Parameters

Parameter Type Default Description
bam_path character vector Vector of BAM file paths
regions integer or character Genomic regions to evaluate, either:
  • Integer (bin size): The genome will be segmented into non-overlapping bins of this size, and CUT&Tag counts will be summarized per bin.
  • File path: Genomic intervals will be extracted directly from the file, and counts will be summarized exactly on these regions. When a file is provided, the file must contain valid genomic coordinates. Only one region file is accepted in the current implementation.
out_dir character “./” Output directory
ref_genome character “hg38” Reference genome
sample_name character NULL Optional prefix added to the output filename. If provided, the final Feather file will include this value at the beginning of the filename.
force_chr_coord logical FALSE By default, if the provided regions file contains a gene identifier column (for example, gene_id in gff/tsv files or the 4th “name” column in BED files), this column will be used as the row name instead of the “CHR_start_end” coordinate string.
Setting force_chr_coord = TRUE overrides this behavior and forces the use of genomic coordinates as row names.

Output Files

The generated count matrix is saved as a Feather file following these conventions:

  • Fixed-size bins (numeric regions):
    Count_Matrix_<BINSIZE>.feather (e.g., Count_Matrix_5000.feather)

  • Custom regions file (character regions):
    Count_Matrix_<prefix>.feather, where <prefix> is the basename of the region file (without extension). (e.g., "peaks.bed"Count_Matrix_peaks.feather)

  • When sample_name is provided:
    <sample_name>_<Count_Matrix_xxx>.feather (e.g., C1_Count_Matrix_peaks.feather)

Read Output Files

library(arrow)
df <- read_feather("output/Count_Matrix_800.feather")
df <- as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE)
head(df)
pos H3K27ac-H3K4me3 H3K4me3-H3K4me3 H3K4me1-H3K4me3 H3K4me3-H3K9me3
1 chr1_1_800 0 0 0 0
2 chr1_801_1600 0 0 0 0
3 chr1_1601_2400 0 0 0 0
4 chr1_2401_3200 0 0 0 0
5 chr1_3201_4000 0 0 0 0
6 chr1_4001_4800 0 0 0 0
library(arrow)
df <- read_feather("output/Count_Matrix_800.feather")
df <- as.data.frame(df, stringsAsFactors = FALSE, check.names = FALSE)
rownames(df) <- df$pos
df$pos <- NULL
df_nonzero <- df[ rowSums(df != 0) > 0, ]
head(df_nonzero)
H3K4me3-H3K4me3 H3K27ac-H3K27me3 H3K9me3-H3K9me3 H3K9me2-H3K9me3
chr1_9601_10400 1.4928058 1.8988764 22.8962419 1.8861111
chr1_10401_11200 0.5071942 0.1011236 1.1037581 0.1138889
chr1_12001_12800 0 0 0.1297468 0
chr1_12801_13600 0 0 0.8702532 1

Usage Examples

library(multiEpiCore)

# Basic Usage with Fixed Bins
build_count_matrix(
  bam_path = bam_files,
  regions = 800,
  out_dir = "count_matrix/"
)

# Use Custom Peak Regions
build_count_matrix(
  bam_path = bam_files,
  regions = "peaks/merged_peaks.bed",
  out_dir = "count_matrix/"
)

# Test Data
bam_dirs <- c(
  C1 = "bam/C1",
  C2 = "bam/C2",
  T1 = "bam/T1",
  T2 = "bam/T2"
)

out_dir <- "./count_matrix"
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

for (sample_name in names(bam_dirs)) {
  bam_dir <- bam_dirs[[sample_name]]
  bam_paths <- list.files(
    path = bam_dir,
    pattern = "\\.bam$",
    full.names = TRUE
  )
  build_count_matrix(
    bam_path    = bam_paths,
    regions     = 800,
    save_dir    = out_dir,
    sample_name = sample_name
  )
}