import math
import os
import tempfile
from typing import Dict, Optional
from . import utils
from .logging import logger
[docs]def STAR_genomeGenerate(
fasta_path: str,
gtf_path: str,
index_dir: str,
n_threads: int = 8,
memory: int = 16 * 1024**3,
temp_dir: Optional[str] = None,
) -> Dict[str, str]:
"""Generate a STAR index from a reference.
Args:
fasta_path: Path to genome fasta
gtf_path: Path to GTF annotation
index_dir: Path to output STAR index
n_threads: Number of threads, defaults to `8`
memory: *Suggested* memory to use (this is not guaranteed), in bytes
temp_dir: Temporary directory
Retuurns:
Dictionary of generated index
"""
if fasta_path.endswith('.gz'):
plaintext_path = utils.mkstemp(dir=temp_dir)
logger.warning(f'Decompressing {fasta_path} to {plaintext_path} because STAR requires a plaintext FASTA')
utils.decompress_gzip(fasta_path, plaintext_path)
fasta_path = plaintext_path
if gtf_path.endswith('.gz'):
plaintext_path = utils.mkstemp(dir=temp_dir)
logger.warning(f'Decompressing {gtf_path} to {plaintext_path} because STAR requires a plaintext GTF')
utils.decompress_gzip(gtf_path, plaintext_path)
gtf_path = plaintext_path
logger.debug(f'Calculating optimal STAR index parameters for {memory} bytes of memory')
# Calculate genome length and number of FASTA entries
genome_length = 0
n_entries = 0
with utils.open_as_text(fasta_path, 'r') as f:
for line in f:
if line.startswith('>'):
n_entries += 1
else:
genome_length += len(line.strip())
# Taken from STAR manual (https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
# genomeSAindexNbases = min(14, log2(GenomeLength)/2 - 1)
sa_index_n_bases = min(14, int(math.log2(genome_length) / 2 - 1))
# genomeChrBinNbits = min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)])
chr_bin_n_bits = min(18, int(math.log2(genome_length / n_entries)))
# Adapted from Cellranger
sa_index_memory = (4**sa_index_n_bases) * 8
sa_sparse_d = max(
1, math.ceil((8 * genome_length) / (max(1, memory - 2 * 1024**3) - genome_length - sa_index_memory))
)
min_memory = genome_length + sa_index_memory + 3 * 1024**3
if memory < min_memory:
raise Exception(f'STAR requires at least {min_memory} to index this reference.')
logger.debug(
'Calculated parameters: '
f'genome_length={genome_length} '
f'n_entries={n_entries} '
f'sa_index_n_bases={sa_index_n_bases} '
f'chr_bin_n_bits={chr_bin_n_bits} '
f'sa_sparse_d={sa_sparse_d}'
)
logger.debug(f'Generating STAR index to {index_dir}')
command = [utils.get_STAR_binary_path(), '--runMode', 'genomeGenerate']
command += ['--genomeDir', index_dir]
command += ['--genomeFastaFiles', fasta_path]
command += ['--sjdbGTFfile', gtf_path]
command += ['--runThreadN', n_threads]
command += ['--limitGenomeGenerateRAM', memory]
command += ['--genomeSAsparseD', sa_sparse_d]
command += ['--genomeSAindexNbases', sa_index_n_bases]
command += ['--genomeChrBinNbits', chr_bin_n_bits]
command += [
'--outTmpDir',
os.path.join(temp_dir, f'{tempfile.gettempprefix()}{next(tempfile._get_candidate_names())}')
]
utils.run_executable(command)
return {'index': index_dir}
@logger.namespaced('ref')
[docs]def ref(
fasta_path: str,
gtf_path: str,
index_dir: str,
n_threads: int = 8,
memory: int = 16 * 1024**3,
temp_dir: Optional[str] = None
):
"""Main interface for the `ref command.
Args:
fasta_path: Path to genome fasta
gtf_path: Path to GTF annotation
index_dir: Path to output STAR index
n_threads: Number of threads, defaults to `8`
memory: *Suggested* memory to use (this is not guaranteed), in bytes
temp_dir: Temporary directory
"""
logger.info(f'Indexing FASTA {fasta_path} and GTF {gtf_path} with STAR')
index_dir = STAR_genomeGenerate(
fasta_path, gtf_path, index_dir, n_threads=n_threads, memory=memory, temp_dir=temp_dir
)