# Source code for methylize.diff_meth_regions

import logging
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.multitest import multipletests
#from scipy import stats --- conflicts with input param names
#from joblib import Parallel, delayed, cpu_count
import matplotlib # color maps and changing Agg backend after cpv alters it.
default_backend = matplotlib.get_backend()
import matplotlib.pyplot as plt
import methylprep
# from cpv.pipeline import pipeline -- copied and modified here
#from methylize.cpv._common import bediter, genomic_control
import methylize
from pathlib import Path
from .progress_bar import * # tqdm, in_notebook

# cpv imports
import sys
import os
import array
from itertools import groupby, cycle
from operator import itemgetter
#import matplotlib
#matplotlib.use('Agg') --- this prevents plot.show(); only for processing, low overhead
import seaborn as sns
sns.set_context("paper")
sns.set_style("dark", {'axes.linewidth': 1})
from functools import cmp_to_key
try:
    cmp
except NameError:
    def cmp(a, b):
        return (a > b) - (a < b)
import toolshed as ts

# app
from .helpers import color_schemes, to_BED, manifest_gene_map
from .diff_meth_pos import manhattan_plot
from .genome_browser import fetch_genes

LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)

np.seterr(under='ignore')

__all__ = ['diff_meth_regions']

def diff_meth_regions(stats, manifest_or_array_type, **kwargs):
    """Calculates and annotates differentially methylated regions (DMR) using the
    `combined-pvalues` pipeline and returns a list of output files.

    comb-p is a command-line tool and a python library that manipulates BED files
    of possibly irregularly spaced P-values and
    (1) calculates auto-correlation,
    (2) combines adjacent P-values,
    (3) performs false discovery adjustment,
    (4) finds regions of enrichment (i.e. series of adjacent low P-values), and
    (5) assigns significance to those regions.
    ref: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3496335/

    Input Parameters:

        stats: dataframe
            dataframe output from diff_meth_pos()
        manifest_or_array_type: class instance or string
            pass in the manifest, or the name of the array
        filename:
            filename to prepend to the .BED file created; creates a
            <filename>.bed output from diff_meth_pos and to_BED.
        genome_build: default 'NEW'
            Each manifest contains two genome builds, marked "NEW" and "OLD".
            By default, the NEWer genome build is used. To use the OLD build,
            set this to "OLD".

    Computational Parameters:

        dist: int
            maximum distance from each probe to scan for adjacent peaks (default 80)
        acf_dist: int
            window-size for smoothing; default is 1/3 of peak-dist (the `dist` kwarg)
        step: int
            step size for bins in the ACF calculation (default 50)
        threshold: float
            extend regions after seeding if the p-value exceeds this value
            (default: same as `seed`)
        seed: float
            minimum p-value required to seed a region (default 0.05)
        no_fdr: bool
            don't use FDR
        genome_control: bool
            correct input p-values for genomic control (the inflation factor);
            this reduces the confounding effects of population stratification
            in EWAS data.

    Display/Output Parameters:

        verbose: bool -- default False
            Display additional processing information on screen.
        plot: bool -- default False
            False will suppress the manhattan plot step.
        prefix: str
            prefix prepended to all output files (e.g. 'dmr' becomes 'dmr_regions.csv')
        region_filter_p:
            max adjusted region-level p-value in final output.
        region_filter_n:
            require at least this many probes in a region.
        annotate: bool
            annotate with the fetch_genes() function, which uses the UCSC refGene
            database to add "nearby" genes to differentially methylated regions
            in the output CSV. To fine-tune the reference database, the tolerance
            of what "nearby" means, and other parameters, set this to False and
            call `methylize.fetch_genes` as a separate step on the
            '..._regions.csv' output file.
        tissue: str
            if specified, adds columns to the annotation output with the
            expression levels of identified genes in any/all tissue(s) that
            match the keyword (e.g. if your methylation samples are whole blood,
            specify `tissue=blood`). For all 54 tissues, use `tissue=all`.

    Returns:
        list
            A list of files created.
    """
""" kw = { 'col_num': 3, # chrom | start | end | pvalue | name 'step': None, # comb-p default is 50, but this will auto-calc 'dist': 80, 'seed': 0.05, 'table': 'refGene', } # user override any defaults kw.update(kwargs) bed_filename = f"{kwargs.get('prefix','')}_dmp_stats" bed_file = to_BED(stats, manifest_or_array_type, save=True, filename=bed_filename, columns=['chrom' ,'start', 'end', 'pvalue', 'name'], genome_build=kw.get('genome_build', None) ) kw['bed_files'] = [bed_file] if not kw.get('prefix'): kw['prefix'] = bed_file.split('.')[0] if isinstance(manifest_or_array_type, methylprep.Manifest): manifest = manifest_or_array_type elif isinstance(manifest_or_array_type, str) and manifest_or_array_type in methylprep.files.manifests.ARRAY_FILENAME.keys(): manifest = methylprep.Manifest(methylprep.ArrayType(manifest_or_array_type)) try: results = _pipeline(kw['col_num'], kw['step'], kw['dist'], kw.get('acf_dist', int(round(0.33333 * kw['dist'], -1))), kw.get('prefix',''), kw.get('threshold', kw['seed']), kw['seed'], kw['table'], kw['bed_files'], region_filter_p = kw.get('region_filter_p',1), region_filter_n = kw.get('region_filter_n'), genome_control=kw.get('genome_control', False), use_fdr=not kw.get('no_fdr',False), log_to_file=True, verbose=kw.get('verbose',False)) if results["result"] != "OK": LOGGER.warning(results) except Exception as e: import traceback LOGGER.error(traceback.format_exc()) LOGGER.error(e) # add probe names back into these files: files_created = [ f"{kw.get('prefix','')}.args.txt", f"{kw.get('prefix','')}.acf.txt", f"{kw.get('prefix','')}.fdr.bed.gz", f"{kw.get('prefix','')}.slk.bed.gz", f"{kw.get('prefix','')}.regions.bed.gz", f"{kw.get('prefix','')}.regions-p.bed.gz", ] chromStart = manifest.data_frame['MAPINFO'].astype(float).reset_index() # _OLD or NEW? # add probe names and prepare one common stats-results CSV: stats_series = {} _deleted = None _no_regions = None _added = None for _file in files_created: # errors here are for when no regions are found, and file are blank. if 'regions.bed' in _file: regions_header = ['chrom','chromStart','chromEnd','min_p','n_probes'] # regions-p includes all of regions; delete below. Path(_file).unlink() _deleted = _file elif 'regions-p.bed' in _file: if results["result"] == "No regions found": _no_regions = _file continue regions_p_header = ['chrom','chromStart','chromEnd','min_p','n_probes','z_p','z_sidak_p'] regions_p_rename = {'#chrom':'chrom', 'start': 'chromStart', 'end': 'chromEnd'} try: df = pd.read_csv(_file, sep='\t').rename(columns=regions_p_rename) df = df.merge(chromStart, left_on='chromStart', right_on='MAPINFO', how='inner').drop(columns=['MAPINFO']).rename(columns={'IlmnID':'name'}) regions_stats_file = f"{kw.get('prefix','')}_regions.csv" df.to_csv(regions_stats_file, index=False) _added = regions_stats_file except Exception as e: LOGGER.error(f"{_file}: {e}") elif '.bed' in Path(_file).suffixes: try: df = pd.read_csv(_file, sep='\t') # match using start position (MAPINFO) df = df.merge(chromStart, left_on='start', right_on='MAPINFO', how='inner').drop(columns=['MAPINFO']).rename(columns={'IlmnID':'name'}) df.to_csv(_file, sep='\t', index=False) if 'fdr' in _file: stats_series[_file] = df[['p', 'region-p','region-q','name']].set_index('name').rename( columns={'p': 'fdr-p', 'region-p': 'region-p', 'region-q': 'fdr-region-q'}) # might PValue be FDR_QValue instead? 
if 'slk' in _file: stats_series[_file] = df[['region-p','name']].set_index('name').rename( columns={'region-p': 'slk-region-p'}) except Exception as e: LOGGER.error(f"{_file}: {e}") files_created.remove(_deleted) files_created.append(_added) if _no_regions: files_created.remove(_no_regions) # switch from `Agg` to interactive; but diff OSes work with diff ones, and the best ones are not default installed. interactive_backends = ['TkAgg', 'Qt5Agg', 'MacOSX', 'ipympl', 'GTK3Agg', 'GTK3Cairo', 'nbAgg', 'Qt5Cairo','TkCairo'] try: matplotlib.switch_backend(default_backend) except: if in_notebook(): try: matplotlib.switch_backend('ipympl') except: pass for backend in interactive_backends: try: matplotlib.switch_backend(backend) except: continue if kw.get('plot') == True: try: manhattan_cols = {'region-p':'PValue', 'region-q':'FDR_QValue', '#chrom':'chromosome', 'start': 'MAPINFO'} _fdr_ = pd.read_csv(kw['prefix'] + '.fdr.bed.gz', sep='\t').rename(columns=manhattan_cols).set_index('name') manhattan_plot(_fdr_, manifest) except Exception as e: if kw.get('verbose',False) == True: LOGGER.error("Could not produce the manhattan plot: {e}") if stats_series != {}: # problem: manifest is unique but FDR/SLK, so need to look up chr_start_end = manifest_gene_map(manifest, genome_build=kw.get('genome_build', None)) chr_start_end.index.name = 'name' probe_index = list(stats_series.values())[0].index #--- cannot DO: stats_series['chrom'] = chr_start_end[ chr_start_end.index.isin(probe_index) ] # _file = f"{kw.get('prefix','')}.fdr.bed.gz" # concat fails because FDR/SLK probes can repeat in index. must merge first. try: stats_df = pd.concat(list(stats_series.values()), axis=1) stats_df = stats_df.merge(chr_start_end, left_index=True, right_index=True) stats_file = f"{kw.get('prefix','')}_stats.csv" stats_df.to_csv(stats_file) files_created.append(stats_file) except Exception as e: LOGGER.error(f'Could not include chrom | chromStart | chromEnd in stats file: {e} (some probes appear in multiple results rows)') # cruzdb is python2x only, and hasn't been maintained since 2014. So we wrote our own UCSC interface function: fetch_genes regions_stats_file = Path(f"{kw.get('prefix','')}_regions.csv") if kw.get('annotate',True) == True and regions_stats_file.exists(): if manifest.array_type in ['mouse']: LOGGER.warning(f"Genome annotation is not supported for {manifest.array_type} array_type.") elif kw.get('genome_build', None) == 'OLD': LOGGER.warning(f"Genome annotation is not supported for OLD genome builds. Only the latest (hg38) build is supported.") else: final_results = fetch_genes(regions_stats_file, tissue=kw.get('tissue',None)) files_created.append(f"{kw.get('prefix','')}_regions_genes.csv") elif kw.get('annotate',True) == True: LOGGER.error(f"Could not annotate; no regions.csv file found") files_created.append(bed_file) return files_created
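
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; not called anywhere in this module): how
# the public diff_meth_regions() entry point above is typically combined with
# diff_meth_pos(). The beta-value matrix, phenotype labels, and the 'epic'
# array-type string below are placeholder assumptions, not fixed requirements.
# ---------------------------------------------------------------------------
def _example_dmr_workflow(betas, pheno):
    """Minimal DMP -> DMR sketch.

    Assumes `betas` is a probes-x-samples dataframe of beta values (e.g. from
    methylprep) and `pheno` is a list of group labels, one per sample.
    """
    stats = methylize.diff_meth_pos(betas, pheno)  # per-probe p-values
    # 'epic' names the array type; a methylprep.Manifest instance also works.
    files = diff_meth_regions(stats, 'epic', prefix='dmr', plot=False)
    return files
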
def _pipeline(col_num0, step, dist, acf_dist, prefix, threshold, seed,
              table, bed_files, mlog=True, region_filter_p=1, region_filter_n=None,
              genome_control=False, use_fdr=True, log_to_file=True, verbose=False):
    """Internal pipeline: adapted from the `combined-pvalues` (cpv) pipeline to run outside of a CLI."""
    # a hack to ensure local files can be imported
    # sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
    #from cpv import acf, slk, fdr, peaks, region_p, stepsize, filter
    #from cpv._common import genome_control_adjust, genomic_control, bediter
    import operator

    if step is None:
        step = min(acf_dist, methylize.cpv.stepsize(bed_files, col_num0))
        if verbose:
            LOGGER.info(f"calculated stepsize as: {step}")

    lags = list(range(1, acf_dist, step))
    lags.append(lags[-1] + step)
    prefix = prefix.rstrip(".")

    # ACF: auto-correlation
    putative_acf_vals = methylize.cpv.acf(bed_files, lags, col_num0, simple=False, mlog=mlog)
    acf_vals = []
    # go out to the max requested distance, but stop once an autocorrelation
    # < 0.05 is added.
    for a in putative_acf_vals:
        # a is ((lmin, lmax), (corr, N))
        # this heuristic seems to work: stop just above the 0.08 correlation lag.
        if a[1][0] < 0.04 and len(acf_vals) > 2:
            break
        acf_vals.append(a)
        if a[1][0] < 0.04 and len(acf_vals):
            break

    if log_to_file:
        # save the arguments that this was called with.
        with open(prefix + ".args.txt", "w") as fh:
            sys_args = " ".join(sys.argv[1:])
            print(sys_args + "\n", file=fh)  # NOTE: only catches command-line args
            import datetime
            timestamp = datetime.datetime.today()
            print("date: %s" % timestamp, file=fh)
            from .__init__ import __version__
            print("version:", __version__, file=fh)
            if verbose:
                LOGGER.info(f"{sys_args} | {timestamp} | {__version__}")
        with open(prefix + ".acf.txt", "w") as fh:
            acf_vals = methylize.cpv.write_acf(acf_vals, fh)
            print("wrote: %s" % fh.name, file=fh)
        if verbose:
            LOGGER.info(f"ACF: {acf_vals}")
    else:
        if verbose:
            LOGGER.info(f"ACF: {acf_vals}")

    spvals, opvals = array.array('f'), array.array('f')
    with ts.nopen(prefix + ".slk.bed.gz", "w") as fhslk:
        fhslk.write('#chrom\tstart\tend\tp\tregion-p\n')
        for chrom, results in methylize.cpv.slk.adjust_pvals(bed_files, col_num0, acf_vals):
            fmt = chrom + "\t%i\t%i\t%.4g\t%.4g\n"
            for row in results:
                row = tuple(row)
                fhslk.write(fmt % row)
                opvals.append(row[-2])
                spvals.append(row[-1])
    if verbose:
        LOGGER.info(f"Original lambda (genomic control): {methylize.cpv.genomic_control(opvals)}")
    del opvals
    gc_lambda = methylize.cpv.genomic_control(spvals)
    if verbose:
        LOGGER.info(f"wrote: {fhslk.name} with lambda: {gc_lambda}")

    if genome_control:
        # adjust p-values by the genomic inflation control factor, lambda.
        # see https://en.wikipedia.org/wiki/Genomic_control
        # or https://onlinelibrary.wiley.com/doi/abs/10.1111/j.0006-341X.1999.00997.x for explanation
        adj = methylize.cpv.genome_control_adjust(
            [d['p'] for d in methylize.cpv.bediter(prefix + ".slk.bed.gz", -1)])
        slk_df = pd.read_csv(prefix + ".slk.bed.gz", sep='\t')
        slk_df['original_p'] = slk_df['p']
        slk_df['p'] = adj
        slk_df.to_csv(prefix + ".slk.bed.gz", sep='\t', index=False)

    with ts.nopen(prefix + ".fdr.bed.gz", "w") as fh:
        fh.write('#chrom\tstart\tend\tp\tregion-p\tregion-q\n')
        for bh, l in methylize.cpv.fdr(fhslk.name, -1):
            fh.write("%s\t%.4g\n" % (l.rstrip("\r\n"), bh))
        if verbose:
            LOGGER.info(f"wrote: {fh.name}")

    fregions = prefix + ".regions.bed.gz"
    with ts.nopen(fregions, "w") as fh:
        list(methylize.cpv.peaks(prefix + ".fdr.bed.gz", -1 if use_fdr else -2,
            threshold, seed, dist, fh, operator.le))
    n_regions = sum(1 for _ in ts.nopen(fregions))
    if verbose:
        LOGGER.info(f"wrote: {fregions} ({n_regions} regions)")
    if n_regions == 0:
        return {"result": "No regions found"}

    # HACK -- edit p-values and region-p to be > 0.00000;
    # this prevents a bunch of "divide by zero" warnings.
    temp = pd.read_csv(prefix + ".slk.bed.gz", sep='\t')
    temp['p'] = temp['p'].apply(lambda x: 0.0000001 if x == 0 else x)
    temp['region-p'] = temp['region-p'].apply(lambda x: 0.0000001 if x == 0 else x)
    temp.to_csv(prefix + ".slk.bed.gz", sep='\t', index=False)

    with ts.nopen(prefix + ".regions-p.bed.gz", "w") as fh:
        N = 0
        fh.write("#chrom\tstart\tend\tmin_p\tn_probes\tz_p\tz_sidak_p\n")
        # use -2 for original, uncorrected p-values in slk.bed
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            for region_line, slk_p, slk_sidak_p, sim_p in methylize.cpv.region_p(
                    prefix + ".slk.bed.gz", prefix + ".regions.bed.gz", -2, step):
                fh.write("%s\t%.4g\t%.4g\n" % (region_line, slk_p, slk_sidak_p))
                fh.flush()
                N += int(slk_sidak_p < 0.05)
        LOGGER.info(f"wrote: {fh.name}, (regions with corrected-p < 0.05: {N})")
        regions_bed = fh.name
        #if all(h in header for h in ('t', 'start', 'end')):

    if region_filter_n is None:
        region_filter_n = 0
    if not Path(f"{prefix}.regions-p.bed.gz").exists():
        return {"result": "No clustered CpG regions found (that differ between your sample groups)."}

    """combined-pvalues: the NEXT function, filter.filter(), requires bedtools
    installed, and only works on macos/linux. -- combines regions from multiple
    datasets, I think.
    with ts.nopen(prefix + ".regions-t.bed", "w") as fh:
        N = 0
        for i, toks in enumerate(filter.filter(bed_files[0], regions_bed, p_col_name=col_num0)):
            if i == 0:
                toks[0] = "#" + toks[0]
            else:
                if float(toks[6]) > region_filter_p:
                    continue
                if int(toks[4]) < region_filter_n:
                    continue
                #if region_ and "/" in toks[7]:
                #    # t-pos/t-neg. if the lower one is > region_?
                #    vals = map(int, toks[7].split("/"))
                #    if min(vals) > region_: continue
                N += 1
            print("\t".join(toks), file=fh)
        print(("wrote: %s, (regions with region-p "
               "< %.3f and n-probes >= %i: %i)")
              % (fh.name, region_filter_p, region_filter_n, N), file=sys.stderr)
    """
    #if verbose:
    #    regions = methylize.cpv.read_regions(fh.name)
    #    methylize.cpv.manhattan(prefix + ".slk.bed.gz", 3,
    #        prefix.rstrip(".") + ".manhattan.png",
    #        False, ['#959899', '#484B4C'], "", False, None,
    #        regions=regions, bonferonni=False)
    return {"result": "OK"}

""" NOT USED
def qqplot(lpys, ax_qq):
    lunif = -np.log10(np.arange(1, len(lpys) + 1) / float(len(lpys)))[::-1]
    ax_qq.plot(lunif, np.sort(lpys), marker=',', linestyle='none', c='#EA352B')
    ax_qq.set_xticks([])
    ax_qq.plot(lunif, lunif, ls='--', c='#959899')
    ax_qq.set_xlabel('')
    ax_qq.set_ylabel('')
    ax_qq.set_yticks([])
    ax_qq.axis('tight')
    ax_qq.axes.set_frame_on(True)
"""
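
# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the pipeline above): one standard way to
# estimate the genomic inflation factor, lambda, that the `genome_control`
# option corrects for. methylize.cpv.genomic_control computes this value
# internally; the stand-alone helper below is a hypothetical equivalent.
# scipy is imported locally under an alias because a module-level
# `from scipy import stats` would shadow this module's `stats` parameter
# (see the commented-out import at the top of this file).
# ---------------------------------------------------------------------------
def _example_genomic_inflation(pvalues):
    """lambda = median observed chi-square / median of the chi2(df=1) null (~0.455)."""
    from scipy import stats as scipy_stats
    chisq = scipy_stats.chi2.isf(np.asarray(pvalues, dtype=float), df=1)
    return np.median(chisq) / scipy_stats.chi2.isf(0.5, df=1)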