Source code for methylize.diff_meth_regions

import logging
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
#from scipy import stats --- conflicts with input param names
#from joblib import Parallel, delayed, cpu_count
import matplotlib # color maps and changing Agg backend after cpv alters it.
default_backend = matplotlib.get_backend()
import matplotlib.pyplot as plt
import methylprep
import numpy as np
# from cpv.pipeline import pipeline -- copied and modified here
#from methylize.cpv._common import bediter, genomic_control
import methylize
from pathlib import Path
from .progress_bar import * # tqdm, in_notebook

# cpv imports
import sys
import os
import array
from itertools import groupby, cycle
from operator import itemgetter
#import matplotlib
#matplotlib.use('Agg') --- this prevents plot.show(); only for processing, low overhead
import seaborn as sns
sns.set_context("paper")
sns.set_style("dark", {'axes.linewidth': 1})
from functools import cmp_to_key
try:
    cmp
except NameError:
    def cmp(a, b):
        return (a > b) - (a < b)
import toolshed as ts

# app
from .helpers import color_schemes, to_BED, manifest_gene_map
from .diff_meth_pos import manhattan_plot
from .genome_browser import fetch_genes

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

np.seterr(under='ignore')

__all__ = ['diff_meth_regions']

[docs]def diff_meth_regions(stats, manifest_or_array_type, **kwargs):
    """Calculates and annotates diffentially methylated regions (DMR) using the `combined-pvalues pipeline` and returns list of output files.

comb-p is a command-line tool and a python library that manipulates BED files of possibly irregularly spaced P-values and

(1) calculates auto-correlation,
(2) combines adjacent P-values,
(3) performs false discovery adjustment,
(4) finds regions of enrichment (i.e. series of adjacent low P-values) and
(5) assigns significance to those regions.

ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3496335/

Input Parameters:

    stats: dataframe
        dataframe output from diff_meth_pos()
    manifest_or_array_type: class instance or string
        pass in the manifest, or the name of the array
    filename:
        filename to prepend to the .BED file created.
    creates a <filename>.bed output from diff_meth_pos and to_BED.
    genome_build: default 'NEW'
        by default, it uses the NEWer genome build. Each manifest contains two genome builds,
        marked "NEW" and "OLD". To use the OLD build, se this to "OLD".

Computational Parameters:

    dist: int
        maximum distance from each probe to scan for adjacent peaks (80)
    acf-dist: int
        window-size for smoothing. Default is 1/3 of peak-dist (dist kwarg),
    step: int
        step size for bins in the ACF calculation (50)
    threshold: foat
        Extend regions after seeding if pvalue exceeds this value (default: same as seed)
    seed: float
        minimum size of a genomic region (0.05)
    no_fdr: bool
        don't use FDR
    genome_control: bool
        correct input pvalues for genome control (inflation factor). This reduces the confounding effects
        of population stratitication in EWAS data.

Display/output Paramters:

    verbose: bool -- default False
        Display additional processing information on screen.
    plot: bool -- default False
        False will suppress the manhattan plot step.
    prefix: str
        prefix that gets appended to all output files (e.g. 'dmr' becomes 'dmr_regions.csv')
    region_filter_p:
        max adjusted region-level p-value in final output.
    region_filter_n:
        req at least this many probes in a region
    annotate: bool
        annotate with fetch_genes() function that uses UCSC refGene database to add "nearby" genes to
        differentially methylated regions in the output CSV. If you want to fine-tune the reference database,
        the tolerance of what "nearby" means, and other parameters, set this to false and call `methylize.fetch_genes`
        as a separate step on the '..._regions.csv' output file.
    tissue: str
        if specified, adds additional columns to the annotation output with the expression levels for identified genes
        in any/all tissue(s) that match the keyword. (e.g. if your methylation samples are whole blood,
        specify `tissue=blood`) For all 54 tissues, use `tissue=all`

Returns:

    list
        A list of files created.
    """
    kw = {
        'col_num': 3, # chrom | start | end | pvalue | name
        'step': None, # comb-p default is 50, but this will auto-calc
        'dist': 80,
        'seed': 0.05,
        'table': 'refGene',
    }
    # user override any defaults
    kw.update(kwargs)

    bed_filename = f"{kwargs.get('prefix','')}_dmp_stats"
    bed_file = to_BED(stats, manifest_or_array_type,
        save=True,
        filename=bed_filename,
        columns=['chrom' ,'start', 'end', 'pvalue', 'name'],
        genome_build=kw.get('genome_build', None)
        )

    kw['bed_files'] = [bed_file]
    if not kw.get('prefix'):
        kw['prefix'] = bed_file.split('.')[0]

    if isinstance(manifest_or_array_type, methylprep.Manifest):
        manifest = manifest_or_array_type
    elif isinstance(manifest_or_array_type, str) and manifest_or_array_type in methylprep.files.manifests.ARRAY_FILENAME.keys():
        manifest = methylprep.Manifest(methylprep.ArrayType(manifest_or_array_type))

    try:
        results = _pipeline(kw['col_num'], kw['step'], kw['dist'],
            kw.get('acf_dist', int(round(0.33333 * kw['dist'], -1))),
            kw.get('prefix',''), kw.get('threshold', kw['seed']),
            kw['seed'], kw['table'], kw['bed_files'],
            region_filter_p = kw.get('region_filter_p',1),
            region_filter_n = kw.get('region_filter_n'),
            genome_control=kw.get('genome_control', False), use_fdr=not kw.get('no_fdr',False),
            log_to_file=True, verbose=kw.get('verbose',False))
        if results["result"] != "OK":
            LOGGER.warning(results)
    except Exception as e:
        import traceback
        LOGGER.error(traceback.format_exc())
        LOGGER.error(e)
    # add probe names back into these files:
    files_created = [
        f"{kw.get('prefix','')}.args.txt",
        f"{kw.get('prefix','')}.acf.txt",
        f"{kw.get('prefix','')}.fdr.bed.gz",
        f"{kw.get('prefix','')}.slk.bed.gz",
        f"{kw.get('prefix','')}.regions.bed.gz",
        f"{kw.get('prefix','')}.regions-p.bed.gz",
    ]
    chromStart = manifest.data_frame['MAPINFO'].astype(float).reset_index() # _OLD or NEW?
    # add probe names and prepare one common stats-results CSV:
    stats_series = {}
    _deleted = None
    _no_regions = None
    _added = None
    for _file in files_created:
        # errors here are for when no regions are found, and file are blank.
        if 'regions.bed' in _file:
            regions_header = ['chrom','chromStart','chromEnd','min_p','n_probes']
            # regions-p includes all of regions; delete below.
            Path(_file).unlink()
            _deleted = _file
        elif 'regions-p.bed' in _file:
            if results["result"] == "No regions found":
                _no_regions = _file
                continue
            regions_p_header = ['chrom','chromStart','chromEnd','min_p','n_probes','z_p','z_sidak_p']
            regions_p_rename = {'#chrom':'chrom', 'start': 'chromStart', 'end': 'chromEnd'}
            try:
                df = pd.read_csv(_file, sep='\t').rename(columns=regions_p_rename)
                df = df.merge(chromStart, left_on='chromStart', right_on='MAPINFO', how='inner').drop(columns=['MAPINFO']).rename(columns={'IlmnID':'name'})
                regions_stats_file = f"{kw.get('prefix','')}_regions.csv"
                df.to_csv(regions_stats_file, index=False)
                _added = regions_stats_file
            except Exception as e:
                LOGGER.error(f"{_file}: {e}")
        elif '.bed' in Path(_file).suffixes:
            try:
                df = pd.read_csv(_file, sep='\t')
                # match using start position (MAPINFO)
                df = df.merge(chromStart, left_on='start', right_on='MAPINFO', how='inner').drop(columns=['MAPINFO']).rename(columns={'IlmnID':'name'})
                df.to_csv(_file, sep='\t', index=False)
                if 'fdr' in _file:
                    stats_series[_file] = df[['p', 'region-p','region-q','name']].set_index('name').rename(
                        columns={'p': 'fdr-p', 'region-p': 'region-p', 'region-q': 'fdr-region-q'}) # might PValue be FDR_QValue instead?
                if 'slk' in _file:
                    stats_series[_file] = df[['region-p','name']].set_index('name').rename(
                        columns={'region-p': 'slk-region-p'})
            except Exception as e:
                LOGGER.error(f"{_file}: {e}")
    files_created.remove(_deleted)
    files_created.append(_added)
    if _no_regions:
        files_created.remove(_no_regions)
    # switch from `Agg` to interactive; but diff OSes work with diff ones, and the best ones are not default installed.
    interactive_backends = ['TkAgg', 'Qt5Agg', 'MacOSX', 'ipympl', 'GTK3Agg', 'GTK3Cairo', 'nbAgg', 'Qt5Cairo','TkCairo']
    try:
        matplotlib.switch_backend(default_backend)
    except:
        if in_notebook():
            try:
                matplotlib.switch_backend('ipympl')
            except:
                pass
        for backend in interactive_backends:
            try:
                matplotlib.switch_backend(backend)
            except:
                continue

    if kw.get('plot') == True:
        try:
            manhattan_cols = {'region-p':'PValue', 'region-q':'FDR_QValue', '#chrom':'chromosome', 'start': 'MAPINFO'}
            _fdr_ = pd.read_csv(kw['prefix'] + '.fdr.bed.gz', sep='\t').rename(columns=manhattan_cols).set_index('name')
            manhattan_plot(_fdr_, manifest)
        except Exception as e:
            if kw.get('verbose',False) == True:
                LOGGER.error("Could not produce the manhattan plot: {e}")

    if stats_series != {}:
        # problem: manifest is unique but FDR/SLK, so need to look up
        chr_start_end = manifest_gene_map(manifest, genome_build=kw.get('genome_build', None))
        chr_start_end.index.name = 'name'
        probe_index = list(stats_series.values())[0].index
        #--- cannot DO: stats_series['chrom'] = chr_start_end[ chr_start_end.index.isin(probe_index) ]
        # _file = f"{kw.get('prefix','')}.fdr.bed.gz"
        # concat fails because FDR/SLK probes can repeat in index. must merge first.
        try:
            stats_df = pd.concat(list(stats_series.values()), axis=1)
            stats_df = stats_df.merge(chr_start_end, left_index=True, right_index=True)
            stats_file = f"{kw.get('prefix','')}_stats.csv"
            stats_df.to_csv(stats_file)
            files_created.append(stats_file)
        except Exception as e:
            LOGGER.error(f'Could not include chrom | chromStart | chromEnd in stats file: {e} (some probes appear in multiple results rows)')

    # cruzdb is python2x only, and hasn't been maintained since 2014. So we wrote our own UCSC interface function: fetch_genes
    regions_stats_file = Path(f"{kw.get('prefix','')}_regions.csv")
    if kw.get('annotate',True) == True and regions_stats_file.exists():
        if manifest.array_type in ['mouse']:
            LOGGER.warning(f"Genome annotation is not supported for {manifest.array_type} array_type.")
        elif kw.get('genome_build', None) == 'OLD':
            LOGGER.warning(f"Genome annotation is not supported for OLD genome builds. Only the latest (hg38) build is supported.")
        else:
            final_results = fetch_genes(regions_stats_file, tissue=kw.get('tissue',None))
            files_created.append(f"{kw.get('prefix','')}_regions_genes.csv")
    elif kw.get('annotate',True) == True:
        LOGGER.error(f"Could not annotate; no regions.csv file found")
    files_created.append(bed_file)
    return files_created


def _pipeline(col_num0, step, dist, acf_dist, prefix, threshold, seed, table,
        bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
        genome_control=False, use_fdr=True, log_to_file=True, verbose=False):
    """Internal pipeline: adapted from `combined-pvalues` (cpv) pipeline to work outside of a CLI."""
    # a hack to ensure local files can be imported
    # sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
    #from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    #from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator


    if step is None:
        step = min(acf_dist, methylize.cpv.stepsize(bed_files, col_num0))
        if verbose: LOGGER.info(f"calculated stepsize as: {step}")

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)

    prefix = prefix.rstrip(".")

    # ACF: auto-correlation
    putative_acf_vals = methylize.cpv.acf(bed_files, lags, col_num0, simple=False,
                                mlog=mlog)
    acf_vals = []
    # go out to max requested distance but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work. stop just above the 0.08 correlation
        # lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2: break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals): break

    if log_to_file:
        # save the arguments that this was called with.
        with open(prefix + ".args.txt", "w") as fh:
            sys_args = " ".join(sys.argv[1:])
            print(sys_args + "\n", file=fh) #### <<<--- only catch command line args
            import datetime
            timestamp = datetime.datetime.today()
            print("date: %s" % timestamp, file=fh)
            from .__init__ import __version__
            print("version:", __version__, file=fh)
            if verbose: LOGGER.info(f"{sys_args} | {timestamp} | {__version__}")

        with open(prefix + ".acf.txt", "w") as fh:
            acf_vals = methylize.cpv.write_acf(acf_vals, fh)
            print("wrote: %s" % fh.name, file=fh)
        if verbose: LOGGER.info(f"ACF: {acf_vals}")
    else:
        if verbose: LOGGER.info(f"ACF: {acf_vals}")

    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in methylize.cpv.slk.adjust_pvals(bed_files, col_num0, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])

    if verbose: LOGGER.info(f"Original lambda (genomic control): {methylize.cpv.genomic_control(opvals)}")
    del opvals

    gc_lambda = methylize.cpv.genomic_control(spvals)
    if verbose: LOGGER.info(f"wrote: {fhslk.name} with lambda: {gc_lambda}")

    if genome_control:
        # adjust p-values by the genomic inflance control factor, lambda
        # see https://en.wikipedia.org/wiki/Genomic_control
        # or https://onlinelibrary.wiley.com/doi/abs/10.1111/j.0006-341X.1999.00997.x for explanation
        adj = methylize.cpv.genome_control_adjust([d['p'] for d in methylize.cpv.bediter(prefix + ".slk.bed.gz", -1)])
        slk_df = pd.read_csv(prefix + ".slk.bed.gz", sep='\t')
        slk_df['original_p'] = slk_df['p']
        slk_df['p'] = adj
        slk_df.to_csv(prefix + ".slk.bed.gz", sep='\t', index=False)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in methylize.cpv.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        if verbose: LOGGER.info(f"wrote: {fh.name}")
    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(methylize.cpv.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2, threshold, seed,
            dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    if verbose: LOGGER.info(f"wrote: {fregions} ({n_regions} regions)")
    if n_regions == 0:
        return {"result": "No regions found"}

    # HACK -- edit pvalues and region-p to be >0.00000
    # this prevents a bunch of "divide by zero" warnings
    temp = pd.read_csv(prefix + ".slk.bed.gz", sep='\t')
    temp['p'] = temp['p'].apply(lambda x: 0.0000001 if x == 0 else x)
    temp['region-p'] = temp['region-p'].apply(lambda x: 0.0000001 if x == 0 else x)
    temp.to_csv(prefix + ".slk.bed.gz", sep='\t', index=False)

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for region_line, slk_p, slk_sidak_p, sim_p in methylize.cpv.region_p(
                                   prefix + ".slk.bed.gz",
                                   prefix + ".regions.bed.gz", -2,
                                   step):
                fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
                fh.flush()
                N += int(slk_sidak_p < 0.05)
            LOGGER.info(f"wrote: {fh.name}, (regions with corrected-p < 0.05: {N})")

    regions_bed = fh.name
    #if all(h in header for h in ('t', 'start', 'end')):
    if region_filter_n is None:
        region_filter_n = 0
    if not Path(f"{prefix}.regions-p.bed.gz").exists():
        return {"result": "No clustered CpG regions found (that differ between your sample groups)."}

    """combined-pvalues: NEXT function filter.filter() requires bedtools installed, and only works on macos/linux.
    -- combines regions from multiple datasets, I think.
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed, p_col_name=col_num0)):
            if i == 0: toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p: continue
                if int(toks[4]) < region_filter_n: continue
                #if region_ and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_: continue

                N += 1
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
                            "< %.3f and n-probes >= %i: %i)") \
                % (fh.name, region_filter_p, region_filter_n, N),
                file=sys.stderr)
    """
    #if verbose:
    #    regions = methylize.cpv.read_regions(fh.name)
    #    methylize.cpv.manhattan(prefix + ".slk.bed.gz", 3,
    #        prefix.rstrip(".") + ".manhattan.png",
    #        False, ['#959899', '#484B4C'], "", False, None,
    #        regions=regions, bonferonni=False)
    return {"result": "OK"}



""" NOT USED
def qqplot(lpys, ax_qq):
    lunif = -np.log10(np.arange(1, len(lpys) + 1) / float(len(lpys)))[::-1]
    ax_qq.plot(lunif, np.sort(lpys), marker=',', linestyle='none', c='#EA352B')
    ax_qq.set_xticks([])
    ax_qq.plot(lunif, lunif, ls='--', c='#959899')
    ax_qq.set_xlabel('')
    ax_qq.set_ylabel('')
    ax_qq.set_yticks([])
    ax_qq.axis('tight')
    ax_qq.axes.set_frame_on(True)
"""