Source code for dynast.ref

import math
import os
import tempfile
from typing import Dict, Optional

from . import utils
from .logging import logger


[docs]def STAR_genomeGenerate(
    fasta_path: str,
    gtf_path: str,
    index_dir: str,
    n_threads: int = 8,
    memory: int = 16 * 1024**3,
    temp_dir: Optional[str] = None,
) -> Dict[str, str]:
    """Generate a STAR index from a reference.

    Args:
        fasta_path: Path to genome fasta
        gtf_path: Path to GTF annotation
        index_dir: Path to output STAR index
        n_threads: Number of threads, defaults to `8`
        memory: *Suggested* memory to use (this is not guaranteed), in bytes
        temp_dir: Temporary directory

    Retuurns:
        Dictionary of generated index
    """
    if fasta_path.endswith('.gz'):
        plaintext_path = utils.mkstemp(dir=temp_dir)
        logger.warning(f'Decompressing {fasta_path} to {plaintext_path} because STAR requires a plaintext FASTA')
        utils.decompress_gzip(fasta_path, plaintext_path)
        fasta_path = plaintext_path
    if gtf_path.endswith('.gz'):
        plaintext_path = utils.mkstemp(dir=temp_dir)
        logger.warning(f'Decompressing {gtf_path} to {plaintext_path} because STAR requires a plaintext GTF')
        utils.decompress_gzip(gtf_path, plaintext_path)
        gtf_path = plaintext_path

    logger.debug(f'Calculating optimal STAR index parameters for {memory} bytes of memory')
    # Calculate genome length and number of FASTA entries
    genome_length = 0
    n_entries = 0
    with utils.open_as_text(fasta_path, 'r') as f:
        for line in f:
            if line.startswith('>'):
                n_entries += 1
            else:
                genome_length += len(line.strip())

    # Taken from STAR manual (https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
    # genomeSAindexNbases = min(14, log2(GenomeLength)/2 - 1)
    sa_index_n_bases = min(14, int(math.log2(genome_length) / 2 - 1))
    # genomeChrBinNbits = min(18, log2[max(GenomeLength/NumberOfReferences,ReadLength)])
    chr_bin_n_bits = min(18, int(math.log2(genome_length / n_entries)))

    # Adapted from Cellranger
    sa_index_memory = (4**sa_index_n_bases) * 8
    sa_sparse_d = max(
        1, math.ceil((8 * genome_length) / (max(1, memory - 2 * 1024**3) - genome_length - sa_index_memory))
    )
    min_memory = genome_length + sa_index_memory + 3 * 1024**3
    if memory < min_memory:
        raise Exception(f'STAR requires at least {min_memory} to index this reference.')

    logger.debug(
        'Calculated parameters: '
        f'genome_length={genome_length} '
        f'n_entries={n_entries} '
        f'sa_index_n_bases={sa_index_n_bases} '
        f'chr_bin_n_bits={chr_bin_n_bits} '
        f'sa_sparse_d={sa_sparse_d}'
    )

    logger.debug(f'Generating STAR index to {index_dir}')
    command = [utils.get_STAR_binary_path(), '--runMode', 'genomeGenerate']
    command += ['--genomeDir', index_dir]
    command += ['--genomeFastaFiles', fasta_path]
    command += ['--sjdbGTFfile', gtf_path]
    command += ['--runThreadN', n_threads]
    command += ['--limitGenomeGenerateRAM', memory]
    command += ['--genomeSAsparseD', sa_sparse_d]
    command += ['--genomeSAindexNbases', sa_index_n_bases]
    command += ['--genomeChrBinNbits', chr_bin_n_bits]
    command += [
        '--outTmpDir',
        os.path.join(temp_dir, f'{tempfile.gettempprefix()}{next(tempfile._get_candidate_names())}')
    ]

    utils.run_executable(command)

    return {'index': index_dir}


@logger.namespaced('ref')
[docs]def ref(
    fasta_path: str,
    gtf_path: str,
    index_dir: str,
    n_threads: int = 8,
    memory: int = 16 * 1024**3,
    temp_dir: Optional[str] = None
):
    """Main interface for the `ref command.

    Args:
        fasta_path: Path to genome fasta
        gtf_path: Path to GTF annotation
        index_dir: Path to output STAR index
        n_threads: Number of threads, defaults to `8`
        memory: *Suggested* memory to use (this is not guaranteed), in bytes
        temp_dir: Temporary directory
    """
    logger.info(f'Indexing FASTA {fasta_path} and GTF {gtf_path} with STAR')
    index_dir = STAR_genomeGenerate(
        fasta_path, gtf_path, index_dir, n_threads=n_threads, memory=memory, temp_dir=temp_dir
    )