from datetime import date
from pycltools.pycltools import *
from collections import Counter
from pprint import pprint as pp
from matplotlib import pyplot as pl
import pandas as pd
import numpy as np
from random import choice, sample
import pandas as pd
from tqdm import tqdm, trange
import sys
import multiprocessing as mp
# Pandas tweaking
pd.options.display.max_colwidth = 100  # widen truncated cell display
pd.options.display.max_rows = 20

# Notebook display setup (this file is an exported Jupyter notebook --
# the % lines are IPython magics, not plain Python)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Matplotlib defaults: wide figures, DejaVu sans-serif, ggplot theme
pl.rcParams['figure.figsize'] = 30, 5
pl.rcParams['font.family'] = 'sans-serif'
pl.rcParams['font.sans-serif'] = ['DejaVu Sans']
pl.style.use('ggplot')

# Notebook header (jprint = styled HTML print helper from pycltools)
jprint("Adrien Leger / EMBL EBI", bold=True, size=150)
jprint("Starting date : 2017-12-12", bold=True, italic=True, size=125)
jprint("Last modification date : {}-{}-{}".format(date.today().year, date.today().month, date.today().day), bold=True, italic=True, size=125)
# Empirical estimate: how many bases must a random sequence contain before
# a sliding window of length k has visited every possible k-mer at least
# once?  Repeated `tries` times for each k-mer length.
bases = ["A","T","C","G","X"]  # NOTE(review): 5 symbols incl. "X" -- confirm the intended alphabet
tries = 50
kmer_len_min = 4
kmer_len_max = 9
# One column per k-mer length, one row per independent trial
df = pd.DataFrame(columns = range (kmer_len_min, kmer_len_max+1) , index=range(tries))
for kmer_len in range (kmer_len_min, kmer_len_max+1):
    possibilities = len(bases)**kmer_len  # size of the full k-mer space
    print (f"Kmer length {kmer_len} = {possibilities} possible kmers")
    for t in range(tries):
        kmers_counter = set()  # distinct k-mers observed so far
        # Seed with a random k-mer, then extend one random base at a time
        kmer = "".join([choice(bases) for _ in range(kmer_len)])
        n_base = 0
        while len(kmers_counter) < possibilities:
            new_base = choice(bases)
            kmer = kmer[1:]+new_base  # slide the window by one base
            kmers_counter.add(kmer)
            n_base+=1
        df.loc[t, kmer_len] = n_base  # bases generated to reach full k-mer coverage
df
# Boxplot of required sequence length per k-mer length (log scale,
# one box per column of df = one box per k-mer length)
fig, ax = pl.subplots(figsize=(10,10))
_ = df.boxplot(ax=ax)
_ = ax.set_yscale("log")
_ = ax.set_xlabel ("kmer length")
_ = ax.set_ylabel ("Sequence length")
_ = ax.set_title ("Length of sequence to generate to cover all kmers")
No homopolymers longer than 6 bases.
Remove oligos with high secondary structure, as predicted by RNAfold.
Generate 50 times more sequences than needed.
Sample to get the best k-mer diversity within the 20,000 oligos in total.
# Define options
bases = ["A","T","C","G"]
n_seq_total = 1000000           # number of random sequences to generate
seq_len = 50                    # length of the random core of each sequence
kmer_len = 7
max_homopolymer_size = 5        # longest run of identical bases tolerated in the core
out_fasta = "./sequence_pool.seq"
# 5' and 3' homopolymer adapters flanking each random core
homopolymer_5_base = "C"
homopolymer_5_len = 6
homopolymer_3_base = "A"
homopolymer_3_len = 11
homopolymer_5 = homopolymer_5_base*homopolymer_5_len
homopolymer_3 = homopolymer_3_base*homopolymer_3_len
# Init values
seq_list = []  # NOTE(review): initialised but never appended to in this cell -- possibly dead
kmer_counter = Counter()  # global k-mer counts over all generated sequences
possible_kmers = len(bases)**kmer_len
stdout_print (f"Kmer length {kmer_len}\n{possible_kmers} possible kmers\n")
# Create sequence iteratively
with open (out_fasta, "w") as out_fp:
    for n_seq in range(n_seq_total):
        seq = ""
        previous_base = ""
        previous_base_count = 0
        for pos in np.arange(seq_len):
            bases_choice = bases
            # Prevent long homopolymers
            # NOTE(review): previous_base_count is incremented at every position
            # (see below), not only when the same base repeats, so this forces a
            # base change every max_homopolymer_size positions rather than
            # tracking true homopolymer runs. Conservative, but confirm intended.
            if previous_base_count >= max_homopolymer_size:
                bases_choice = [b for b in bases_choice if b != previous_base]
                previous_base_count = 0
            # 5 prime exception = first base differs from the leading homopolymer base
            if pos == 0:
                bases_choice = [b for b in bases_choice if b != homopolymer_5_base]
            # 3 prime exception = last base differs from the trailing homopolymer base
            if pos == seq_len-1:
                bases_choice = [b for b in bases_choice if b != homopolymer_3_base]
            # randomly select one of the allowed bases
            base = choice(bases_choice)
            # Extend the sequence
            seq += base
            previous_base = base
            previous_base_count += 1
        # Count the different kmers in the sequence (all overlapping windows)
        for k_start, k_end in zip (np.arange(0, seq_len-kmer_len+1), np.arange(kmer_len, seq_len+1)):
            kmer = seq[k_start:k_end]
            kmer_counter [kmer] +=1
        # Add adapters and write in file
        out_fp.write (f"{homopolymer_5}{seq}{homopolymer_3}\n")
stdout_print (f"{len(kmer_counter)} kmers generated")
# Per-kmer counts, most frequent first
s = pd.Series(kmer_counter, name="count")
s.sort_values(ascending=False, inplace=True)
s.to_frame()
# Submit RNAfold (ViennaRNA secondary-structure prediction) on the sequence
# pool through the LSF scheduler (bsub); `bash` is a pycltools helper that
# runs the command inside the named virtualenv.
infile = "./sequence_pool.seq"
outfile = "./sequence_pool_folded.txt"
queue = "highpri"
thread = 3
mem = 5000  # memory requested from LSF, in MB
cmd = f"bsub -n {thread} -M {mem} -R 'rusage[mem={mem}]' -q {queue} -oo {outfile} -N RNAfold -i {infile}"
bash (virtualenv="ViennaRNA_2.4.3", cmd=cmd, live="stderr")
# Parse the RNAfold output: pairs of lines, a sequence line followed by a
# structure line ending with the minimum free energy in parentheses,
# e.g. "....((..))... (-12.30)".
seq_dict = {}  # sequence -> MFE score (float)
with open ("./sequence_pool_folded.txt") as fp:
    score = 0
    seq = ""
    for line in fp:
        if not seq:
            seq = line.strip()  # first line of a pair: the sequence itself
        else:
            # extract the text between the last "(" and the last ")"
            score = line.rpartition(")")[0].rpartition("(")[2].strip()
            if score:
                seq_dict[seq] = float(score)
            seq=""  # reset so the next line is treated as a sequence
# Distribution of MFE scores across the pool
s = pd.Series(seq_dict, name="MFE")
display(s.head().to_frame())
display(s.describe(percentiles=[0.05,0.10, 0.25, 0.5, 0.75, 0.90, 0.95]).to_frame())
# Keep only weakly-structured sequences (MFE above the cutoff) and count the
# k-mers they contribute, skipping the adapter regions at both ends.
cutoff = -17.6                 # MFE threshold; higher = less secondary structure
kmer_counter = Counter()
start_offset = 6               # skip the 5' adapter region
end_offset = 12                # skip the 3' adapter region
kmer_len = 7
valid_seq_list = []
for seq, score in tqdm(seq_dict.items()):
    if score > cutoff:
        valid_seq_list.append(seq)
        # windows from start_offset up to len(seq)-kmer_len-end_offset+1 inclusive
        for k_start in np.arange(start_offset, len(seq)-kmer_len-end_offset+2):
            k_end = k_start+kmer_len
            kmer = seq[k_start:k_end]
            kmer_counter [kmer] +=1
stdout_print (f"Number of valid sequences: {len(valid_seq_list)}\n")
stdout_print (f"{len(kmer_counter)} kmers found\n")
s = pd.Series(kmer_counter, name="count")
s.sort_values(ascending=False, inplace=True)
display(s.to_frame())
# Write the retained sequences, one per line
outfile = "valid_seq_list.txt"
with open(outfile, "w") as fh:
    for seq in valid_seq_list:
        fh.write(f"{seq}\n")
head(outfile, 5)  # pycltools helper: preview the first lines of the file
We want as many distinct k-mers as possible, with the most homogeneous count distribution.
# Monte-Carlo sampling: draw n_bundles random bundles of nseq_per_bundle
# sequence indices (with replacement) and score each bundle by its k-mer
# coverage statistics, looking for high and even k-mer representation.
infile = "valid_seq_list.txt"
seq_list=[]
with open(infile, "r") as fh:
    for seq in fh:
        seq_list.append (seq.rstrip())
n_seq = len(seq_list)
nseq_per_bundle = 20000
n_bundles = 100
start_offset = 6               # skip the 5' adapter region
end_offset = 12                # skip the 3' adapter region
kmer_len = 7
seq_bundle_list = []
for i in trange(n_bundles):
    # Random sample of sequence indices (with replacement)
    seq_id_list = np.random.randint(0, n_seq, size=nseq_per_bundle)
    kmer_counter = Counter()
    for seq_id in seq_id_list:
        seq = seq_list[seq_id]
        for k_start in np.arange(start_offset, len(seq)-kmer_len-end_offset+2):
            k_end = k_start+kmer_len
            kmer = seq[k_start:k_end]
            kmer_counter [kmer] +=1
    # Coverage statistics for this bundle
    n_kmers = len (kmer_counter)
    counts = list(kmer_counter.values())
    min_count = np.min (counts)
    sd_count = np.std (counts)
    max_count = np.max (counts)
    diff = max_count-min_count
    seq_bundle_list.append({"seq_id_list":seq_id_list,"n_kmers":n_kmers,"sd_count":sd_count, "min":min_count, "max":max_count, "diff":diff})
df = pd.DataFrame(seq_bundle_list)
df.sort_values(["min"], inplace=True, ascending=False)
df
This works, but it is too slow to scale up.
def kmer_per_seq_bundle (seq_file, nseq_per_bundle, n_bundles, start_offset, end_offset, kmer_len, min_score, threads):
    """
    Multithreaded function to speed up the analysis.

    Fan-out/fan-in pipeline over multiprocessing manager queues:
      * make_bundle_w           -> seq_bundle_q : random bundles of sequence indices
      * kmer_stat_w (x threads) -> kmer_stat_q  : k-mer coverage stats per bundle
      * counter_w               <- counter_q    : progress-bar ticks

    Args:
        seq_file: text file with one sequence per line
        nseq_per_bundle: sequences drawn (with replacement) per bundle
        n_bundles: number of random bundles to evaluate
        start_offset / end_offset: bases skipped at each end before k-mer counting
        kmer_len: k-mer length
        min_score: bundles whose least-covered k-mer occurs fewer times are discarded
        threads: number of kmer_stat_w worker processes

    Returns:
        pd.DataFrame of retained bundle statistics, sorted by n_kmers (desc),
        min (desc), then max (asc).
    """
    stdout_print ("Parse sequence file\n")
    seq_list=[]
    with open(seq_file, "r") as fh:
        for seq in fh:
            seq_list.append (seq.rstrip())
    n_seq = len(seq_list)
    stdout_print ("Simulating sequence combination\n")
    # Init Multiprocessing variables
    with mp.Manager() as manager:
        seq_bundle_q = manager.Queue()   # producer -> workers: index bundles
        kmer_stat_q = manager.Queue()    # workers -> main: per-bundle stats
        counter_q = manager.Queue()      # workers -> counter: progress ticks
        # Define sampling process
        make_bundle_args = (n_seq, nseq_per_bundle, n_bundles, seq_bundle_q, threads)
        make_bundle_ps = mp.Process (target=make_bundle_w, args=make_bundle_args)
        # Define kmer count processes (one per requested thread)
        kmer_stat_args = (seq_list, start_offset, end_offset, kmer_len, seq_bundle_q, kmer_stat_q, counter_q, min_score)
        kmer_stat_psl = [mp.Process (target=kmer_stat_w, args=kmer_stat_args) for _ in range (threads)]
        # Define progress counter process
        counter_args = (counter_q, n_bundles, threads)
        counter_ps = mp.Process (target=counter_w, args=counter_args)
        # Start processes
        make_bundle_ps.start()
        for p in kmer_stat_psl: p.start()
        counter_ps.start()
        # Wait for processes to finish
        make_bundle_ps.join()
        for p in kmer_stat_psl: p.join()
        counter_ps.join()
        # Drain the result queue while the manager is still alive
        stdout_print ("Unpacking queue to list\n")
        l = queue_to_list (q=kmer_stat_q, total=n_bundles, n_poison_pills=threads)
    stdout_print ("Transform results to dataframe\n")
    df = pd.DataFrame(l)
    df.sort_values(["n_kmers", "min", "max"], inplace=True, ascending=[False, False, True])
    return df
def make_bundle_w (n_seq, nseq_per_bundle, n_bundles, seq_bundle_q, threads):
    """Producer worker: enqueue `n_bundles` random bundles of sequence
    indices onto seq_bundle_q, then one poison pill (None) per consumer
    so every kmer_stat_w worker knows production is over."""
    # Each bundle is a list of nseq_per_bundle indices drawn uniformly
    # (with replacement) from [0, n_seq)
    for _ in range(n_bundles):
        bundle = list(np.random.randint(0, n_seq, size=nseq_per_bundle))
        seq_bundle_q.put(bundle)
    # One shutdown sentinel per consumer process
    for _ in range(threads):
        seq_bundle_q.put(None)
def kmer_stat_w (seq_list, start_offset, end_offset, kmer_len, seq_bundle_q, kmer_stat_q, counter_q, min_score):
    """Consumer worker: for every bundle of sequence indices taken from
    seq_bundle_q, count the k-mers in the trimmed window of each sequence
    (skipping start_offset bases at the 5' end and end_offset at the 3'
    end) and push the bundle's coverage statistics to kmer_stat_q when the
    least-covered k-mer occurs at least min_score times. One tick goes to
    counter_q per bundle; a final None poison pill is sent to both output
    queues on shutdown."""
    while True:
        index_bundle = seq_bundle_q.get()
        if index_bundle is None:
            break  # producer is done
        counts_by_kmer = Counter()
        # Tally k-mers over every sequence referenced by the bundle
        for idx in index_bundle:
            sequence = seq_list[idx]
            last_start = len(sequence) - kmer_len - end_offset + 2
            for start in np.arange(start_offset, last_start):
                counts_by_kmer[sequence[start:start + kmer_len]] += 1
        # Summarise bundle coverage
        occurrences = list(counts_by_kmer.values())
        stats = {
            "seq_id_list": index_bundle,
            "n_kmers": len(counts_by_kmer),
            "sd_count": np.std(occurrences),
            "min": np.min(occurrences),
            "max": np.max(occurrences),
        }
        # Forward only bundles meeting the minimum-coverage requirement
        if stats["min"] >= min_score:
            kmer_stat_q.put(stats)
        counter_q.put(1)
    # Propagate shutdown downstream
    kmer_stat_q.put(None)
    counter_q.put(None)
def counter_w (counter_q, n_bundles, threads):
    """Progress worker: advance a tqdm bar by one for every tick read from
    counter_q; stop after one None poison pill has arrived from each of
    the `threads` kmer_stat_w workers."""
    remaining_pills = threads
    with tqdm(total=n_bundles, mininterval=0.5) as pbar:
        while remaining_pills > 0:
            item = counter_q.get()
            if item is None:
                remaining_pills -= 1
            else:
                pbar.update(1)
def queue_to_list (q, total, n_poison_pills):
    """Drain a queue of results into a list.

    Consumes items from `q` until one None poison pill per producer
    (`n_poison_pills` in total) has been received, collecting every other
    item into the returned list while a tqdm bar tracks progress.

    Args:
        q: queue to drain; each producer terminates its stream with None
        total: expected number of real items, used to size the progress bar
        n_poison_pills: number of None sentinels to wait for

    Returns:
        list of all non-None items received, in arrival order.
    """
    l=[]
    # Bug fix: `total` was previously accepted but never used, so the
    # progress bar had no end point -- pass it through to tqdm.
    with tqdm (total=total, mininterval=0.5) as pbar:
        for _ in range (n_poison_pills):
            for item in iter(q.get, None):
                l.append (item)
                pbar.update(1)
    return l
# Run the multiprocess sampling simulation (heavy: 500000 bundles, 20 workers)
df = kmer_per_seq_bundle (
    seq_file = "valid_seq_list.txt",
    nseq_per_bundle = 20000,
    n_bundles = 500000,
    start_offset = 6,
    end_offset = 12,
    kmer_len = 7,
    min_score = 7,   # keep only bundles where every k-mer occurs >= 7 times
    threads = 20)
display(df.head())
# Persist the (expensive) result, then reload and re-rank
outfile = "sampling_simulation_result.df"
df.to_pickle (outfile)
df = pd.read_pickle("sampling_simulation_result.df")
# NOTE(review): n_kmers is sorted ascending here, unlike the descending
# sort inside kmer_per_seq_bundle -- confirm which ranking is intended.
df.sort_values(["n_kmers", "min", "sd_count"], inplace=True, ascending=[True, False, True])
df.head(20)
I will select row 99, which has a minimum of 7 occurrences of each 7-mer and the lowest k-mer count standard deviation of all.
# Get the indices of the sequences from the winning bundle and export the
# final panel as plain text and fasta.
fn = "sampling_simulation_result.df"
best_index = 99  # row picked manually from the ranked simulation results
jprint("Load sampling simulation result", bold=True)
df = pd.read_pickle(fn)
best_values = df.loc[best_index]
print (best_values)
# Deduplicate: bundles are drawn with replacement, so indices may repeat
seq_id_list = set(best_values["seq_id_list"])
jprint("Inport all possible sequences", bold=True)
fn = "valid_seq_list.txt"
with open(fn) as fh:
    seq_list = [seq.strip() for seq in fh]
jprint("Select wining sequences and write to output file", bold=True)
fn_txt = "DNA_test_sequence_panel_selection.txt"
fn_fa = "DNA_test_sequence_panel_selection.fa"
with open (fn_txt, "w") as fp_txt, open (fn_fa, "w") as fp_fa:
    for seq_id in seq_id_list:
        seq = seq_list[seq_id]
        # DNA output: convert any U (RNA) to T
        seq = seq.replace("U", "T")
        fp_txt.write("{}\n".format(seq))
        fp_fa.write(">ref_{}\n{}\n".format(seq_id, seq))
head(fn_fa)  # pycltools helper: preview the fasta output