TEST pycltools package

This notebook contains tests for the functions contained in pycltools package

In [167]:
# Jupyter specific imports
from IPython.core.display import display, HTML, Markdown
# Import of required packages
from os import remove
# import all the functions from pyCL
from pycltools.pycltools import *

JUPYTER NOTEBOOK SPECIFIC TOOLS

jhelp

In [168]:
help(jhelp)
Help on function jhelp in module pycltools.pycltools:

jhelp(function, full=True, print_private=False, **kwargs)
    Print a nice looking help string based on the name of a declared function. By default print the function
    definition and description
    * function
        Name of a declared function or class method
    * full
        If True, the help string will included a description of all arguments

In [169]:
jhelp(jhelp, full=True)

jhelp (function, full=True, print_private=False, **kwargs)

Print a nice looking help string based on the name of a declared function. By default print the function
definition and description
* function
Name of a declared function or class method
* full
If True, the help string will included a description of all arguments

jprint

In [170]:
jhelp(jprint, full=True)

jprint (*args, **kwargs)

FOR JUPYTER NOTEBOOK ONLY
Format a string in HTML and print the output. Equivalent of print, but highly customizable. Many options can be
passed to the function.
* *args
One or several objects that can be cast in str
* **kwargs
Formatting options to tweak the html rendering
Boolean options : bold, italic, highlight, underlined, striked, subscripted, superscripted
String oprions: font, color, size, align, background_color, line_height

In [45]:
txt="Lorem ipsum condimentum elementum sapien nam eleifend quisque sapien curae"
jprint(txt,font="sans", color="purple", size=200, bold=True)

Lorem ipsum condimentum elementum sapien nam eleifend quisque sapien curae

In [46]:
txt="Lorem ipsum\n\tcondimentum elementum\n\t\tsapien nam eleifend quisque\n\t\t\tsapien curae"
jprint(txt,font="sans", color="powderblue", size=200, bold=True, line_height=50)

Lorem ipsum
 condimentum elementum
  sapien nam eleifend quisque
   sapien curae

In [47]:
jprint("Lorem","ipsum","condimentum","elementum", 1, True, bold=False, italic=False, highlight=False, underlined=True, striked=False, subscripted=False, superscripted=False, font="calibri", color="grey", size=250, align="center")

Lorem ipsum condimentum elementum 1 True

toogle_code

In [171]:
jhelp(toogle_code, full=True)

toogle_code (**kwargs)

FOR JUPYTER NOTEBOOK ONLY
Hide code with a clickable link in a j
upyter notebook

In [49]:
#toogle_code()

larger_display

In [172]:
jhelp(larger_display, full=True)

larger_display (percent=100, **kwargs)

FOR JUPYTER NOTEBOOK ONLY
Resize the area of the screen containing the notebook according to a given percentage of the available width
* percent percentage of the width of the screen to use [DEFAULT:100]

In [173]:
larger_display(100)

PREDICATES

is_readable_file

In [174]:
jhelp(is_readable_file, full=True)

is_readable_file (fp, raise_exception=True, **kwargs)

Verify the readability of a file or list of file

In [53]:
try:
    is_readable_file("./data/KJHYTGYUJ")
    print ("OK")
except OSError as E:
    print(E)
./data/KJHYTGYUJ is not a valid file
In [54]:
try:
    is_readable_file("./data/RADAR_Secondary.txt")
    print ("OK")
except OSError as E:
    print(E)
OK

is_gziped

In [175]:
jhelp(is_gziped, full=True)

is_gziped (fp, **kwargs)

Return True if the file is Gziped else False

In [56]:
is_gziped("./data/RADAR_Secondary.txt")
Out[56]:
False
In [57]:
is_gziped("./data/RADAR_Secondary.txt.gz")
Out[57]:
True

has_extension

In [176]:
jhelp(has_extension, full=True)

has_extension (fp, ext, pos=-1, raise_exception=False, **kwargs)

Test presence of extension in a file path
* ext
Single extension name or list of extension names without dot. Example ["gz, "fa"]
* pos
Postition of the extension in the file path. -1 for the last, -2 for the penultimate and so on [DEFAULT -1 = Last position]

In [59]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "gz")
Out[59]:
True
In [60]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "fa")
Out[60]:
False
In [61]:
has_extension("./data/test/RADAR_Secondary.txt.gz", "txt", -2)
Out[61]:
True

PATH MANIPULATION

file_basename

In [177]:
jhelp(file_basename, full=True)

file_basename (fp, **kwargs)

Return the basename of a file without folder location and extension

In [63]:
file_basename("./data/RADAR_Secondary.txt.gz")
Out[63]:
'RADAR_Secondary'

extensions

In [178]:
jhelp(extensions, full=True)

extensions (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)

Return The extension of a file in lower-case. If archived file ("gz", "tgz", "zip", "xz", "bz2")
the method will output the base extension + the archive extension as a string

In [65]:
print(extensions("./data/RADAR_Secondary.txt.gz"))
print(extensions("./data/RADAR_Secondary.txt"))
print(extensions("./data/RADAR_Secondary"))
.txt.gz
.txt

extensions_list

In [66]:
jhelp(extensions_list, full=True)

extensions_list (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)

Return The extension of a file in lower-case. If archived file ("gz", "tgz", "zip", "xz", "bz2")
the method will output the base extension + the archive extension as a list

In [67]:
print(extensions_list("./data/RADAR_Secondary.txt.gz"))
print(extensions_list("./data/RADAR_Secondary.txt"))
print(extensions_list("./data/RADAR_Secondary"))
['txt', 'gz']
['txt']
[]

file_name

In [179]:
jhelp(file_name, full=True)

file_name (fp, **kwargs)

Return The complete name of a file with the extension but without folder location

In [69]:
file_name("./data/test/RADAR_Secondary.txt.gz")
Out[69]:
'RADAR_Secondary.txt.gz'

dir_name

In [180]:
jhelp(dir_name, full=True)

dir_name (fp, **kwargs)

Return the name of the directory where the file is located

In [71]:
print(dir_name("./data/test/RADAR_Secondary.txt.gz"))
print(dir_name("./__init__.py"))
print(dir_name("/bin/bash"))
test
.
bin

dir_path

In [72]:
jhelp(dir_path, full=True)

dir_path (fp, **kwargs)

Return the directory path of a file

In [73]:
print(dir_path("./data/test/RADAR_Secondary.txt.gz"))
print(dir_path("./__init__.py"))
print(dir_path("/bin/bash"))
./data/test
.
/bin

STRING FORMATTING

supersplit

In [181]:
jhelp(supersplit, full=True)

supersplit (string, separator='', **kwargs)

like split but can take a list of separators instead of a simple separator

In [75]:
a = "chr7\t74138\t774138\tA>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324\t0"

print(supersplit(a, ["\t","|"]))

print(supersplit(a))

print(supersplit(a, "|"))
['chr7', '74138', '774138', 'A>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324', '0']
['chr7', '74138', '774138', 'A>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324', '0']
['chr7\t74138\t774138\tA>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324\t0']

rm_blank

In [182]:
jhelp(rm_blank, full=True)

rm_blank (name, replace='', **kwargs)
Replace blank spaces in a name by a given character (default = remove)
Blanks at extremities are always removed and nor replaced

In [77]:
a = "chr\t\t17|LU NG:LYMPHOBLAST    OID_CELL_LINE|15342557:152585     96:22327324\t0"

print(rm_blank(a))

print(rm_blank(a, replace="*"))
chr17|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:223273240
chr*17|LU*NG:LYMPHOBLAST*OID_CELL_LINE|15342557:152585*96:22327324*0

FILE MANIPULATION

copyFile

In [183]:
jhelp(copyFile, full=True)

copyFile (src, dest, **kwargs)

Copy a single file to a destination file or folder (with error handling/reporting)
* src
Source file path
* dest
Path of the folder where to copy the source file

In [79]:
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/")
Error: './data/RADAR_Secondary.txt' and './data/RADAR_Secondary.txt' are the same file
In [80]:
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/RADAR_Secondary_copy.txt")

gzip_file

In [184]:
jhelp(gzip_file, full=True)

gzip_file (fpin, fpout=None, **kwargs)

gzip a file
* fpin
Path of the input uncompressed file
* fpout
Path of the output compressed file (facultative)

In [82]:
gzip_file("./data/RADAR_Secondary.txt")
Compressing ./data/RADAR_Secondary.txt
Out[82]:
'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt.gz'

gunzip_file

In [185]:
jhelp(gunzip_file, full=True)

gunzip_file (fpin, fpout=None, **kwargs)

ungzip a file
* fpin
Path of the input compressed file
* fpout
Path of the output uncompressed file (facultative)

In [84]:
gunzip_file("./data/RADAR_Secondary.txt.gz")
Uncompressing ./data/RADAR_Secondary.txt.gz
Out[84]:
'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt'

FILE INFORMATION

linerange

In [186]:
jhelp(linerange, full=True)

linerange (fp, range_list=[], line_numbering=True, max_char_line=150, **kwargs)

Print a range of lines in a file according to a list of start end lists. Handle gziped files
* fp
Path to the file to be parsed
* range_list
list of start, end coordinates lists or tuples
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of character to print per line

In [86]:
file = "./data/RADAR_Secondary.txt"
linerange (file)
0	#location	reference	tissue	coverage	editing_level(%)
1	chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
2	chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86
In [87]:
file = "./data/gencode_sample.gff3"
linerange (file, [[2,5],[10,12],[98,100]], max_char_line=100)
...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
...
10	chr1	HAVANA	exon	30564	30667	.	+	.	ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=E...
11	chr1	HAVANA	exon	30976	31097	.	+	.	ID=exon:ENST00000473358.1:3;Parent=ENST00000473358.1;gene_id=E...
12	chr1	HAVANA	transcript	30267	31109	.	+	.	ID=ENST00000469289.1;Parent=ENSG00000243485.3;gene_id=EN...
...
98	chr1	HAVANA	exon	287517	287921	.	-	.	ID=exon:ENST00000335577.4:2;Parent=ENST00000335577.4;gene_id...
99	chr1	HAVANA	gene	357383	359681	.	-	.	ID=ENSG00000236743.1;gene_id=ENSG00000236743.1;gene_type=lin...
100	chr1	HAVANA	transcript	357383	359681	.	-	.	ID=ENST00000441866.1;Parent=ENSG00000236743.1;gene_id...
...
In [88]:
file = "./data/RADAR_Secondary.txt.gz"
linerange (file, line_numbering=False)
#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86

cat

In [187]:
jhelp(cat, full=True)

cat (fp, max_lines=100, line_numbering=False, max_char_line=150, **kwargs)

Emulate linux cat cmd but with line cap protection. Handle gziped files
* fp
Path to the file to be parsed
* max_lines
Maximal number of lines to print
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of character to print per line

In [90]:
file = "./data/RADAR_Secondary.txt.gz"
cat (file, max_lines=10)
#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
chr1:1157460	Peng et al 2012	Lymphoblastoid cell line	66	22.73
chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
...
chr1:10521237	Peng et al 2012	Lymphoblastoid cell line	34	17.65
chr1:10521238	Peng et al 2012	Lymphoblastoid cell line	35	37.14
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86
In [91]:
file="./data/gencode_sample.gff3"
cat (file, max_lines=20, line_numbering=True, max_char_line=100)
0	##gff-version 3
1	#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - lo...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
6	##sequence-region chr1 1 248956422
7	chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRN...
8	chr1	HAVANA	transcript	29554	31097	.	+	.	ID=ENST00000473358.1;Parent=ENSG00000243485.3;gene_id=ENS...
9	chr1	HAVANA	exon	29554	30039	.	+	.	ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=EN...
...
9990	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000421147.5:3;Parent=ENST00000421147.5...
9991	chr1	HAVANA	transcript	221966410	221984964	.	+	.	ID=ENST00000441160.1;Parent=ENSG00000228437.5;...
9992	chr1	HAVANA	exon	221966410	221966502	.	+	.	ID=exon:ENST00000441160.1:1;Parent=ENST00000441160.1...
9993	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000441160.1:2;Parent=ENST00000441160.1...
9994	chr1	HAVANA	exon	221984054	221984964	.	+	.	ID=exon:ENST00000441160.1:3;Parent=ENST00000441160.1...
9995	chr1	HAVANA	gene	222041705	222064763	.	-	.	ID=ENSG00000232679.1;gene_id=ENSG00000232679.1;gene_...
9996	chr1	HAVANA	transcript	222041705	222064763	.	-	.	ID=ENST00000438158.1;Parent=ENSG00000232679.1;...
9997	chr1	HAVANA	exon	222064685	222064763	.	-	.	ID=exon:ENST00000438158.1:1;Parent=ENST00000438158.1...
9998	chr1	HAVANA	exon	222058414	222058678	.	-	.	ID=exon:ENST00000438158.1:2;Parent=ENST00000438158.1...
9999	chr1	HAVANA	exon	222041705	222041922	.	-	.	ID=exon:ENST00000438158.1:3;Parent=ENST00000438158.1...

tail

In [188]:
jhelp(tail, full=True)

tail (fp, n=10, line_numbering=False, max_char_line=150, **kwargs)

Emulate linux tail cmd. Handle gziped files
* fp
Path to the file to be parsed
* n
Number of lines to print starting from the end of the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of character to print per line

In [93]:
file = "./data/RADAR_clean.txt"
tail (file, n = 4)
...
chr1	225974581	225974581	A>I|SRP9|YH|22327324	28.89	+
chr1	225974735	225974735	A>I|SRP9|YH|22327324	23.88	+
chr1	225974746	225974746	A>I|SRP9|YH|22327324	71.19	+
In [94]:
file = "./data/RADAR_Secondary.txt.gz"
tail (file, n = 4, line_numbering=True)
...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86
In [95]:
file="./data/gencode_sample.gff3"
tail (file, n = 5, max_char_line=100)
...
chr1	HAVANA	transcript	222041705	222064763	.	-	.	ID=ENST00000438158.1;Parent=ENSG00000232679.1;gene_...
chr1	HAVANA	exon	222064685	222064763	.	-	.	ID=exon:ENST00000438158.1:1;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222058414	222058678	.	-	.	ID=exon:ENST00000438158.1:2;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222041705	222041922	.	-	.	ID=exon:ENST00000438158.1:3;Parent=ENST00000438158.1;gene...
In [189]:
jhelp(head, full=True)

head (fp, n=10, ignore_comment_line=False, comment_char='#', max_char_line=200, sep='\t', max_char_col=50, **kwargs)

Emulate linux head cmd. Handle gziped files and bam files
* fp
Path to the file to be parsed. Works with text, gunziped and binary bam/sam files
* n
Number of lines to print starting from the begining of the file (Default 10)
* ignore_comment_line
Skip initial lines starting with a specific character. Pointless for bam files(Default False)
* comment_char
Character or string for ignore_comment_line argument (Default "#")
* max_char_line
Maximal number of character to print per line (Default 150)

In [97]:
head("./data/RADAR_Main.txt", n= 3)
#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp conservation_rhesus conservation_mouse 
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N                   N                  
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N                   N                  

In [98]:
head("./data/RADAR_Main.txt", ignore_comment_line=True,n= 3)
chr1 206256301 C1orf186   - intronic   intronic   no no N N N 
chr6 116991832 intergenic - intergenic intergenic no no N N N 
chr7 30504355  NOD1       - intronic   intronic   no no N N N 

In [99]:
head("./data/RADAR_Main.txt", n=5, max_char_line=110)
#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp cons...
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N   ...
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N   ...
chr7        30504355  NOD1       -      intronic   intronic   no   no                  N                  N   ...
chr1        85127959  SSX2IP     -      Syn        Gln->Gln   no   no                  N                  N   ...

In [100]:
head("./data/RADAR_Secondary.txt.gz", n=6, ignore_comment_line=True)
chr1:1037916 Peng et al 2012 Lymphoblastoid cell line 9  66.67 
chr1:1156882 Peng et al 2012 Lymphoblastoid cell line 42 36.59 
chr1:1157460 Peng et al 2012 Lymphoblastoid cell line 66 22.73 
chr1:1252441 Peng et al 2012 Lymphoblastoid cell line 11 72.73 
chr1:1252443 Peng et al 2012 Lymphoblastoid cell line 11 45.45 
chr1:1253357 Peng et al 2012 Lymphoblastoid cell line 31 32.26 

In [101]:
head("./data/sample.sam", n=6, ignore_comment_line=True)
chr1|35235|35295|-|5.1   272 chr12 37283     0 61M * 0 0 *                                                  *                                                  
chr1|90965|91025|-|7.57  256 chr16 90215899  0 61M * 0 0 *                                                  *                                                  
chr1|91055|91115|-|7.60  256 chr2  168290980 0 61M * 0 0 *                                                  *                                                  
chr1|92081|92141|-|8.1   272 chr1  268657    0 61M * 0 0 *                                                  *                                                  
chr1|92111|92171|-|8.2   256 chr5  181462264 0 61M * 0 0 *                                                  *                                                  
chr1|110943|111003|-|9.1 0   chrY  24307299  0 61M * 0 0 AATGAAAGATATGTGTTTTTCATATTACCAGGTAGATGATAAGGAGATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 

In [102]:
head ("./data/sample_100.bam", n=6)
chr1|1736694|1736754|-|168.51      256 chr6  108404793 0  32M29H   * 0 0 *                                                  *                                                  
chr1|20158612|20158672|+|508.32    0   chr1  20158612  60 61M      * 0 0 CTCAGAGGCTTGAAAAGTAGCATCCACCCCCTTCTGGGCATCAATCACAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|47096793|47096853|-|1008.6    272 chr1  156061950 0  2H54M5H  * 0 0 *                                                  *                                                  
chr1|65003940|65004000|-|1364.17   256 chr13 107349700 0  16M1I44M * 0 0 *                                                  *                                                  
chr1|108202106|108202166|+|1958.74 0   chr1  108202106 60 61M      * 0 0 GGACAGAAAACAAATCAGTAGTTACCAGTTGTGACTAGCGGGAAGGGAAT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|147173091|147173151|+|2353.12 272 chr2  74122749  0  22H39M   * 0 0 *                                                  *                                                  

linesample

In [103]:
jhelp(linesample, full=True)

linesample (fp, n_lines=100, line_numbering=True, max_char_line=150, **kwargs)

Randomly sample lines in a file and print them. Handle gziped files
* fp
Path to the file to be parsed
* n_lines
Number of lines to sample in the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of character to print per line

In [104]:
linesample("./data/RADAR_clean.txt", n_lines=10, line_numbering=True)
77	chr1	6710595	6710595	A>I|DNAJC11|YH|22327324	50.00	-
96	chr1	10521237	10521237	A>I|DFFA|YH|22327324	17.65	-
266	chr1	32737172	32737172	A>I|LCK|YH|22327324	35.71	+
342	chr1	40205396	40205396	A>I|PPIE|YH|22327324	63.64	+
448	chr1	52875019	52875019	A>I|PRPF38A|YH|22327324	38.89	+
533	chr1	85449497	85449497	A>I|MCOLN2|YH|22327324	20.00	-
610	chr1	114296188	114296188	A>I|PHTF1|YH|22327324	20.59	-
767	chr1	155444343	155444343	A>I|ASH1L|YH|22327324	42.86	-
824	chr1	157516004	157516004	A>I|FCRL5|YH|22327324	21.88	-
946	chr1	204526795	204526795	A>I|MDM4|YH|22327324	32.29	+
In [105]:
linesample("./data/RADAR_Secondary.txt.gz", n_lines=10, line_numbering=True)
4	chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
8	chr1:1418532	Peng et al 2012	Lymphoblastoid cell line	5	60.00
51	chr1:6608345	Peng et al 2012	Lymphoblastoid cell line	13	46.15
56	chr1:6707305	Peng et al 2012	Lymphoblastoid cell line	33	39.39
61	chr1:6708354	Peng et al 2012	Lymphoblastoid cell line	15	40.00
62	chr1:6708680	Peng et al 2012	Lymphoblastoid cell line	24	25.00
63	chr1:6708681	Peng et al 2012	Lymphoblastoid cell line	24	20.83
75	chr1:6710585	Peng et al 2012	Lymphoblastoid cell line	30	65.52
90	chr1:10520702	Peng et al 2012	Lymphoblastoid cell line	98	11.22
93	chr1:10520751	Peng et al 2012	Lymphoblastoid cell line	166	28.92

count_uniq

In [106]:
jhelp(count_uniq, full=True)

count_uniq (fp, colnum, select_values=None, drop_values=None, skip_comment='#', sep='\t', **kwargs)

Count unique occurences in a specific column of a tabulated file
* fp
Path to the file to be parsed (gzipped or not)
* colnum
Index number of the column to summarize
* select_values
Select specific lines in the file based on a dictionary containing column index(es) and valu(es) or list
of values to select. Exemple {2:["exon", "transcript"], 4:"lincRNA"}. DEFAULT=None
* drop_values
Same think that select_value but will drop the lines instead. DEFAULT=None
* skip_comment
Drop any comment lines starting with this character. DEFAULT="#"
* sep
Character or list of characters to use in order to split the lines. Exemple [" ",";"]. DEFAULT=" "

In [107]:
count_uniq("./data/Small_editing_Peng_hg38.bed", colnum=17, sep=['\t',"|"])
Out[107]:
17
intergenic    110
intron         55
3-UTR          17
unknown        12
dtype: int64
In [108]:
count_uniq("./data/gencode_sample.gff3", colnum=17, sep=["\t","=", ";"], select_values={2:["transcript", "exon"], 6:"+"})
Out[108]:
17
lincRNA                     2031
antisense                   1600
processed_transcript         686
sense_intronic               105
TEC                           36
sense_overlapping             11
3prime_overlapping_ncrna       2
dtype: int64

colsum

In [190]:
jhelp(colsum, full=True)

colsum (fp, colrange=None, separator='', header=False, ignore_hashtag_line=False, max_items=10, ret_type='md', **kwargs)

Create a summary of selected columns of a file
* fp
Path to the file to be parsed
* colrange
A list of column index to parse
* separator
A character or a list of characters to split the lines
* ignore_hashtag_line
skip line starting with a # symbol
* max_items
maximum item per line
* ret_type
Possible return types:
md = markdown formatted table,
dict = raw parsing dict,
report = Indented_text_report

In [110]:
display(Markdown(colsum("./data/RADAR_Main.txt", header=True, colrange=[0,2,6], max_items=15)))
#chromosome chr1 chr17 chr9 chr15 chr6 chr14 chr18 chr2 chrY chr4 chr7
Count 4 3 2 2 2 1 1 1 1 1 1
gene RABEP1 NUP133 JUB GREB1L SPHKAP NLGN4Y CELSR2 RBPJ TLE4 SOCS7 ADPGK UBE2O TSC1 GRIK2 MEF2A ...
Count 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...
alu? no
Count 19
In [111]:
colsum("./data/RADAR_Main.txt", header=True, ret_type="dict", colrange=[0,3])
Out[111]:
OrderedDict([(0,
              OrderedDict([('chr1', 4),
                           ('chr6', 2),
                           ('chr7', 1),
                           ('chr15', 2),
                           ('chr9', 2),
                           ('chr17', 3),
                           ('chr4', 1),
                           ('chrY', 1),
                           ('chr2', 1),
                           ('chr18', 1),
                           ('chr14', 1)])),
             (3, OrderedDict([('-', 10), ('+', 9)]))])
In [112]:
print(colsum(
        "./data/RADAR_clean.txt",
        header=True,
        ignore_hashtag_line=True,
        ret_type="report",
        separator=["\t","|"],
        max_items=5))
0
	chr1	997
1
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
2
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
3
	A>I	997
4
	FDPS	34
	MDM4	31
	CTSS	28
	DNAJC11	25
	S100PBP	24
	...	...
5
	YH	997
6
	22327324	997
7
	33.33	31
	66.67	31
	50.00	23
	57.14	22
	60.00	22
	...	...
8
	-	527
	+	470

fastcount

In [191]:
jhelp(fastcount, full=True)

fastcount (fp, **kwargs)

Efficient way to count the number of lines in a file. Handle gziped files

In [114]:
fastcount("./data/RADAR_Secondary.txt")
Out[114]:
100
In [115]:
fastcount("./data/RADAR_Secondary.txt.gz")
Out[115]:
100

simplecount

In [192]:
jhelp(simplecount, full=True)

simplecount (fp, ignore_hashtag_line=False, **kwargs)

Simple way to count the number of lines in a file with more options

In [117]:
simplecount("./data/Small_m5C_Squires_hg38.bed", ignore_hashtag_line=True)
Out[117]:
194
In [118]:
simplecount("./data/RADAR_Secondary.txt.gz")
Out[118]:
100

DIRECTORY MANIPULATION

mkdir

In [193]:
jhelp(mkdir, full=True)

mkdir (fp, level=1, **kwargs)

Reproduce the ability of UNIX "mkdir -p" command
(ie if the path already exits no exception will be raised).
Can create nested directories by recursivity
* fp
path name where the folder should be created
* level
level in the path where to start to create the directories. Used by the program for the recursive creation of
directories

In [120]:
mkdir("./data/test_dir")
In [121]:
mkdir ("./test/test/test")
!rm -rf ./test
Creating /home/aleg/Programming/pycltools/docs/test
Creating /home/aleg/Programming/pycltools/docs/test/test
Creating /home/aleg/Programming/pycltools/docs/test/test/test

SHELL MANIPULATION

make_cmd_str

In [194]:
jhelp(make_cmd_str, full=True)

make_cmd_str (prog_name, opt_dict={}, opt_list=[], **kwargs)

Create a Unix like command line string from the prog name, a dict named arguments and a list of unmammed arguments
exemple make_cmd_str("bwa", {"b":None, t":6, "i":"../idx/seq.fa"}, ["../read1", "../read2"])
* prog_name
Name (if added to the system path) or path of the program
* opt_dict
Dictionary of option arguments such as "-t 5". The option flag have to be the key (without "-") and the the
option value in the dictionary value. If no value is requested after the option flag "None" had to be assigned
to the value field.
* opt_list
List of simple command line arguments

In [123]:
make_cmd_str("bwa", {"-b":None, "-t":6, "-i":"../idx/seq.fa"}, ["../read1", "../read2"])
Out[123]:
'bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2 '

bash_basic

In [195]:
jhelp(bash_basic, full=True)

bash_basic (cmd, virtualenv=None, **kwargs)

Sent basic bash command
* cmd
A command line string formatted as a string
* virtualenv
If specified will try to load a virtualenvwrapper environment before runing the command

In [125]:
print(bash_basic("ls -l"))
print(bash_basic("echo TTTT"))
print(bash_basic("grep ./data/RADAR_Secondary.txt"))
total 136
drwxrwxr-x 3 aleg aleg  4096 Dec 10 10:54 data
-rw-rw-r-- 1 aleg aleg 39582 Dec 10 12:05 pycltools_functions_list.ipynb
-rw-rw-r-- 1 aleg aleg 93686 Dec 10 12:07 pycltools_tests.ipynb


None
TTTT


None


None

bash

In [196]:
jhelp(bash, full=True)

bash (cmd, virtualenv=None, live='stdout', print_stdout=True, ret_stdout=False, log_stdout=None, print_stderr=True, ret_stderr=False, log_stderr=None, print_cmd=False, dry=False, **kwargs)

More advanced version of bash calling with live printing of the standard output and possibilities to log the
redirect the output and error as a string return or directly in files. If ret_stderr and ret_stdout are True a
tuple will be returned and if both are False None will be returned
* cmd
A command line string formatted as a string
* virtualenv
If specified will try to load a virtualenvwrapper environment before runing the command
* print_stdout
If True the standard output will be LIVE printed through the system standard output stream
* ret_stdout
If True the standard output will be returned as a string
* log_stdout
If a filename is given, the standard output will logged in this file
* print_stderr
If True the standard error will be printed through the system standard error stream
* ret_stderr
If True the standard error will be returned as a string
* log_stderr
If a filename is given, the standard error will logged in this file

In [127]:
bash("ls", print_stdout=True, ret_stdout=True,)
data
pycltools_functions_list.ipynb
pycltools_tests.ipynb
Out[127]:
'data\npycltools_functions_list.ipynb\npycltools_tests.ipynb\n'
In [128]:
bash("for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done", live="stderr",  print_stdout=True, ret_stdout=True, print_stderr=True)
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
Error code #2 during execution of the command : for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done
In [129]:
bash("ls", print_stdout=False, ret_stdout=False, log_stdout="./data/stdout.txt")
head("./data/stdout.txt")
Only 3 lines in the file
data                           
pycltools_functions_list.ipynb 
pycltools_tests.ipynb          

bash_update

In [197]:
jhelp(bash_update, full=True)

bash_update (cmd, update_freq=1, **kwargs)

FOR JUPYTER NOTEBOOK
Run a bash command and print the output in the cell. The output is updated each time until the output is None.
This is suitable for monitoring tasks that log events until there is nothing else to print such as bjobs or bpeeks.
* cmd
A command line string formatted as a string
* update_freq
The frequency of output updating in seconds [DEFAULT: 1]

In [131]:
#bash_update("htop")

DICTIONNARY FORMATTING

dict_to_md

In [198]:
jhelp(dict_to_md, full=True)

dict_to_md (d, key_label='', value_label='', transpose=False, sort_by_key=False, sort_by_val=True, max_items=None, **kwargs)

Transform a dict into a markdown formated table

In [133]:
d = {"a":12,"b":14,"c":8,"d":56,"e":76}
display(Markdown(dict_to_md(d, "Letter", "Number", sort_by_val=True)))
display(Markdown(dict_to_md(d, "Letter", "Number", transpose=True, max_items=3)))
Letter Number
e 76
d 56
b 14
a 12
c 8
Letter e d b ...
Number 76 56 14 ...

dict_to_report

In [199]:
jhelp(dict_to_report, full=True)

dict_to_report (d, tab='\t', ntab=0, sep=':', sort_dict=True, max_items=None, **kwargs)

Recursive function to return a text report from nested dict or OrderedDict objects

In [135]:
d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab=" | "))

d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765, "c2.3":7533,"c2.4":76433,"c2.5":876543,"c2.6":89765,"c2.7":8654},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab="--", max_items=4, sort_dict=True))
a:12
b:14
c
 | c1:12
 | c2
 |  | c2.1:33221
 |  | c2.2:765
 | c3:32
 | c4:443
d:56
e:76

a:12
b:14
c
--c1:12
--c2
----c2.5:876543
----c2.6:89765
----c2.4:76433
----c2.1:33221
----...:...
--c3:32
--c4:443
d:56
e:76


TABLE FORMATTING

reformat_table

In [200]:
jhelp(reformat_table, full = True)

reformat_table (input_file, output_file='', return_df=False, init_template=[], final_template=[], header='', keep_original_header=True, header_from_final_template=False, replace_internal_space='_', replace_null_val='*', subst_dict={}, filter_dict=[], predicate=None, standard_template=None, verbose=False, **kwargs)

Reformat a table given an initial and a final line templates indicated as a list where numbers
indicate the data column and strings the formatting characters

* input_file
A file with a structured text formatting (gzipped or not)
* output_file
A file path to output the reformatted table (if empty will not write in a file)
* return_df
If true will return a pandas dataframe containing the reformated table (Third party pandas package required)
by default the columns will be names after the final template [DEFAULT:False]
* init_template
A list of indexes and separators describing the structure of the input file
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = [0," ",1," ",2," ",3,"|",4," ",5," ",6]
Alternatively, instead of the numbers, string indexes can be used, but they need to be enclosed in curly
brackets to differentiate them from the separators. This greatly simplify the writing of the final template.
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = ["{chrom}"," ","{start}"," ","{end}","|","{name}"," ","{score}"," ","{strand}"]
* final_template
A list of indexes and separators describing the required structure of the output file. Name indexes need to
match indexes of the init_template and have to follow the same synthax [DEFAULT:Same that init template]
Example final line = "chr1 631539 631540 m5C|-|HeLa|22344696 - -"
Final template = [0," ",1," ",2," m5C|-|HeLa|22344696 - ",6]
* header
A string to write as a file header at the beginning of the file
* keep_original_header
If True the original header of the input file will be copied at the beginning of the output file [DEFAULT:True]
* header_from_final_template
Generate a header according to the name or number of the fields given in the final_template [DEFAULT:True]
* replace_internal_space
All internal blank space will be replaced by this character [DEFAULT:"_"]
* replace_null_val
Field with no value will be replaced by this character [DEFAULT:"*"]
* subst_dict
Nested dictionary of substitution per position to replace specific values by others [DEFAULT:None]
Example: { 0:{"chr1":"1","chr2":"2"}, 3:{"Squires":"5376774764","Li":"27664684"}}
* filter_dict
A dictionary of list per position to filter out lines with specific values [DEFAULT:None]
Example: { 0:["chr2", "chr4"], 1:["46767", "87765"], 5:["76559", "77543"]}
* predicate
A lambda predicate function for more advance filtering operations [DEFAULT:None]
Example: lambda val_dict: abs(int(val_dict[1])-int(val_dict[2])) <= 2000
* standard_template
Existing standard template to parse the file instead of providing one manually. List of saved templates:
- "gff3_ens_gene" = Template for ensembl gff3 fields. Select only the genes lines and decompose to individual elements.
- "gff3_ens_transcript" = Template for ensembl gff3 fields. Select only the transcript lines and decompose to individual elements.
- "gtf_ens_gene" = Template for ensembl gft fields. Select only the genes lines and decompose to individual elements
* verbose
If True will print detailed information [DEFAULT:False]

In [137]:
# With numeric index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6],
    final_template=[0,"\t",1,"\t",2,"\tm5C|*|HeLa|22344696\t-\t",6],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n"
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")
0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+
0	# New header
1	chr1	631539	631540	m5C|*|HeLa|22344696	-	+
2	chr1	631540	631541	m5C|*|HeLa|22344696	-	+
...
192	chr1	19311959	19311960	m5C|*|HeLa|22344696	-	-
193	chr1	19608342	19608343	m5C|*|HeLa|22344696	-	+
194	chr1	19608343	19608344	m5C|*|HeLa|22344696	-	+
In [138]:
# With str index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=["{chrom}","\t","{start}","\t","{end}","|","{name}","\t","{score}","\t","{strand}"],
    final_template=["{start}","\t","{end}","\tadditional_informations\t","{name}"],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n",
    verbose=True
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")
Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: False
	header: # New header

	final_template: ['{start}', '\t', '{end}', '\tadditional_informations\t', '{name}']
	init_template: ['{chrom}', '\t', '{start}', '\t', '{end}', '|', '{name}', '\t', '{score}', '\t', '{strand}']
	return_df: False
	output_file: ./data/Small_m5C_Squires_hg38_reformat.bed
	input_file: ./data/Small_m5C_Squires_hg38.bed
Unenumerated named arguments list:
Initial template values
chrom	start	end|name	score	strand
Final template values
start	end	additional_informations	name
194 Lines processed	194 Lines pass	0 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+
0	# New header
1	631539	631540	Squires	additional_informations	id1
2	631540	631541	Squires	additional_informations	id2
...
192	19311959	19311960	Squires	additional_informations	id185
193	19608342	19608343	Squires	additional_informations	id186
194	19608343	19608344	Squires	additional_informations	id187
In [139]:
subst_dict = {0:{"chr1":"1", "chr2":"2"}, 3:{"Peng":"22344696"}}
filter_dict = {18:["intron"]}
input_file="./data/Small_editing_Peng_hg38.bed"
output_file="./data/Small_editing_Peng_hg38_reformat.bed"

reformat_table(
    input_file, output_file,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    final_template=[0,"\t",1,"\t",2,"\t",9,">",10,"|",3,"|HeLa|",19,"\t",11,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    subst_dict = subst_dict,
    filter_dict = filter_dict,
    verbose=True
    )

linerange (input_file)
linerange (output_file)
Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: {18: ['intron']}
	subst_dict: {0: {'chr1': '1', 'chr2': '2'}, 3: {'Peng': '22344696'}}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: [0, '\t', 1, '\t', 2, '\t', 9, '>', 10, '|', 3, '|HeLa|', 19, '\t', 11, '\t', 21]
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: False
	output_file: ./data/Small_editing_Peng_hg38_reformat.bed
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	9>10|3|HeLa|19	11	21
194 Lines processed	139 Lines pass	55 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#
...
197	chr1	9173454	9173455	Peng|chr1|9156101|-|T|Y|A->G|35.14%|99|T|24|C|13|37|intergenic|-	0	-
198	chr1	9173533	9173534	Peng|chr1|9156180|-|T|Y|A->G|24.10%|61|T|148|C|47|195|intergenic|-	0	-
199	chr1	9173535	9173536	Peng|chr1|9156182|-|T|Y|A->G|66.15%|99|C|129|T|66|195|intergenic|-	0	-
0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#
...
142	1	9173454	9173455	A>G|22344696|HeLa|-	35.14	-
143	1	9173533	9173534	A>G|22344696|HeLa|-	24.10	-
144	1	9173535	9173536	A>G|22344696|HeLa|-	66.15	-
In [140]:
input_file="./data/Small_editing_Peng_hg38.bed"

df = reformat_table(
    input_file,
    return_df=True,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    verbose=True)

print(head(input_file, 11))

df.head()
Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: []
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: True
	output_file: 
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
No final template given. Create final template from init template
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
# Transcriptome-wide map of editing sites [hg38 coordinates]
# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
#
# Data cleaned and converted to BED6, coordinate conversion to hg38 using liftOver.
# Maintainer: Maurits Evers (maurits.evers@anu.edu.au)
#
chr1	1102535	1102536	Peng|chr1|1027779|-|T|Y|A->G|66.67%|37|C|6|T|3|9|intron|C1orf159	0	-
chr1	1221501	1221502	Peng|chr1|1146745|-|T|Y|A->G|36.59%|99|T|26|C|15|42|intron|SDF4	0	-
chr1	1222079	1222080	Peng|chr1|1147323|-|T|Y|A->G|22.73%|94|T|51|C|15|66|intron|SDF4	0	-
chr1	1251840	1251841	Peng|chr1|1177084|-|T|Y|A->G|56.25%|99|C|9|T|7|16|intergenic|-	0	-
chr1	1252243	1252244	Peng|chr1|1177487|-|T|Y|A->G|19.44%|30|T|29|C|7|36|intergenic|-	0	-

None
Out[140]:
0 1 2 3 4 5 6 7 8 9 ... 12 13 14 15 16 17 18 19 20 21
0 chr1 1102535 1102536 Peng chr1 1027779 - T Y A ... 37 C 6 T 3 9 intron C1orf159 0 -
1 chr1 1221501 1221502 Peng chr1 1146745 - T Y A ... 99 T 26 C 15 42 intron SDF4 0 -
2 chr1 1222079 1222080 Peng chr1 1147323 - T Y A ... 94 T 51 C 15 66 intron SDF4 0 -
3 chr1 1251840 1251841 Peng chr1 1177084 - T Y A ... 99 C 9 T 7 16 intergenic - 0 -
4 chr1 1252243 1252244 Peng chr1 1177487 - T Y A ... 30 T 29 C 7 36 intergenic - 0 -

5 rows × 22 columns

In [141]:
input_file = "./data/gencode_sample.gff3"

df = reformat_table(
    input_file,
    return_df=True,
    standard_template="gff3_ens_transcript", 
    keep_original_header=False,
    header_from_final_template= True,
    verbose=True
    )

print(head(input_file, 11))
df.head()
Enumerated named argument list:
	verbose: True
	standard_template: gff3_ens_transcript
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: True
	keep_original_header: False
	header: 
	final_template: []
	init_template: []
	return_df: True
	output_file: 
	input_file: ./data/gencode_sample.gff3
Unenumerated named arguments list:
Using gff3 ensembl transcript template. Non-transcript features will be filtered out
No final template given. Create final template from init template
Initial template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
Final template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
##gff-version 3
#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - long non-coding RNAs
#provider: GENCODE
#contact: gencode-help@sanger.ac.uk
#format: gff3
#date: 2015-12-03
##sequence-region chr1 1 248956422
chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P13.3;level=2;tag=ncRNA_host;havana_gene=OTTHUMG00000000959.2
chr1	HAVANA	transcript	29554	31097	.	+	.	ID=ENST00000473358.1;Parent=ENSG00000243485.3;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P1...
chr1	HAVANA	exon	29554	30039	.	+	.	ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P...
chr1	HAVANA	exon	30564	30667	.	+	.	ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P...

None
Out[141]:
seqid source type start end score strand phase ID Parent ... gene_status gene_name transcript_type transcript_status transcript_name level transcript_support_level tag havana_gene havana_transcript
0 chr1 HAVANA transcript 29554 31097 . + . ENST00000473358.1 ENSG00000243485.3 ... KNOWN RP11-34P13.3 lincRNA KNOWN RP11-34P13.3-001 2 5 not_best_in_genome_evidence,dotter_confirmed,b... OTTHUMG00000000959.2 OTTHUMT00000002840.1
1 chr1 HAVANA transcript 30267 31109 . + . ENST00000469289.1 ENSG00000243485.3 ... KNOWN RP11-34P13.3 lincRNA KNOWN RP11-34P13.3-002 2 5 not_best_in_genome_evidence,basic OTTHUMG00000000959.2 OTTHUMT00000002841.2
2 chr1 HAVANA transcript 34554 36081 . - . ENST00000417324.1 ENSG00000237613.2 ... KNOWN FAM138A lincRNA KNOWN FAM138A-001 2 1 basic OTTHUMG00000000960.1 OTTHUMT00000002842.1
3 chr1 HAVANA transcript 35245 36073 . - . ENST00000461467.1 ENSG00000237613.2 ... KNOWN FAM138A lincRNA KNOWN FAM138A-002 2 3;havana_gene=OTTHUMG00000000960.1;havana_tran... * * *
4 chr1 HAVANA transcript 89295 120932 . - . ENST00000466430.5 ENSG00000238009.6 ... KNOWN RP11-34P13.7 lincRNA KNOWN RP11-34P13.7-001 2 5 not_best_in_genome_evidence,basic OTTHUMG00000001096.2 OTTHUMT00000003225.1

5 rows × 23 columns

WEB TOOLS

url_exist

In [201]:
jhelp(url_exist, full=True)

url_exist (url, **kwargs)

Predicate verifying if an url exist without downloading all the link

In [143]:
url_exist("http://www.google.com") # When this one will be False it will probably be the end of the world
Out[143]:
True
In [144]:
url_exist("http://www.JUYGKUYHGJHFJ.com")
Out[144]:
True

wget

In [202]:
jhelp(wget, full=True)

wget (url, out_name='', progress_block=100000000, **kwargs)

Download a file from an URL to a local storage.
* url
A internet URL pointing to the file to download
* outname
Name of the outfile where (facultative)
* progress_block
size of the byte block for the progression of the download

In [146]:
outfile = wget("")
if outfile:
    print(outfile)
    remove(outfile)
unknown url type: ''
In [147]:
outfile = wget("https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig", "test.bigWig", 50000000)
if outfile:
    print(outfile)
    remove(outfile)
Downloading: https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig	Bytes: 258930225
50.0 MB Downloaded	[19.31 %]
100.0 MB Downloaded	[38.62 %]
150.0 MB Downloaded	[57.93 %]
200.0 MB Downloaded	[77.24 %]
250.0 MB Downloaded	[96.55 %]
258.9 MB Downloaded	[100 %]
test.bigWig

FUNCTION TOOLS

In [203]:
jhelp(print_arg, full=True)

print_arg (**kwargs)

Print calling function named and unnamed arguments

In [149]:
def test (A,B,C=7,*args, **kwarg):
    print_arg()

test(1,2,3,5, z=65, x=100)
Enumerated named argument list:
	C: 3
	B: 2
	A: 1
Unenumerated named arguments list:
	z: 65
	x: 100
Unnamed positional arguments list:
	5

SSH TOOLS

scp

In [204]:
jhelp(scp, full=True)

scp (hostname, local_file, remote_dir, username=None, rsa_private_key=None, ssh_config='~/.ssh/config', verbose=False, **kwargs)

Copy a file over ssh in a target remote directory
* hostname
Name of the host ssh server
* username
name of the user
* rsa_private_key
path to the rsa private key
* local_file
path to the local file
* remote_dir
path to the target directory
* ssh_config
use as an alternative method instead of giving the username and rsa_private_key. Will fetch them from the config file directly

In [151]:
#scp(hostname="ebi-cli-001.ebi.ac.uk", local_file="../README.md", remote_dir="~/test", username="aleg", rsa_private_key="/home/aleg/.ssh/ebi_rsa")
In [152]:
#scp(hostname="ebi", local_file="../README.md", remote_dir="~/test")

Package Tools

get_package_file

In [153]:
jhelp(get_package_file, full=True)

get_package_file (package, fp='', **kwargs)

Verify the existence of a file from the package data and return a file path
* package
Name of the package
* fp
Relative path to the file in the package. Usually package_name/data/file_name
if the path points to a directory the directory arborescence will be printed

In [154]:
get_package_file("pyCL", "pyCL/")
/home/aleg/Programming/pycltools/pycltools/pycltools.py:1947: UserWarning: The 'pyCL' distribution was not found and is required by the application
  warnings.warn(str(E))

SAM/BAM TOOLS

bam_sample

In [155]:
jhelp(bam_sample, full=True)

bam_sample (fp_in, fp_out, n_reads, verbose=False, **kwargs)

Sample reads from a SAM/BAM file and write in a new file
* fp_in
Path to the input file in .bam/.sam/.cram (the format will be infered from extension)
* fp_out
Path to the output file in .bam/.sam/.cram (the format will be infered from extension)
* n_reads
number of reads to sample

In [156]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.sam", n_reads=100, verbose=True)
linesample("./data/sample_100.sam", n_lines=10, max_char_line=100)
Found 5000 reads in input file
Wrote 100 reads in output file
20	@SQ	SN:chr21	LN:46709983
44	@SQ	SN:KI270305.1	LN:1472
111	@SQ	SN:KI270508.1	LN:1951
146	@SQ	SN:KI270710.1	LN:40176
170	@SQ	SN:KI270734.1	LN:165050
171	@SQ	SN:KI270735.1	LN:42811
217	chr14|61657775|61657835|+|13447.7	272	chr7	127489894	0	61M	*	0	0	*	*	NM:i:3	MD:Z:39A14A4A1	AS:i:49
234	chr17|43159683|43159737|-|19991.10	272	chr9	131908717	0	55M	*	0	0	*	*	NM:i:0	MD:Z:55	AS:i:55
239	chr18|14010134|14010194|+|21568.4	272	chr5	4925139	0	61M	*	0	0	*	*	NM:i:0	MD:Z:61	AS:i:61
266	chr3|138485055|138485115|+|33361.101	256	chr12	6132886	0	61M	*	0	0	*	*	NM:i:5	MD:Z:16G4C0A3C25G8	AS:...
In [157]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)
!samtools view "./data/sample_100.bam" | head
Found 5000 reads in input file
Wrote 100 reads in output file
chr1|805036|805096|+|89.10	272	chr8	436410	0	61M	*	0	0	*	*	NM:i:3	MD:Z:7A19C0A32	AS:i:46
chr1|110408997|110409057|+|2013.22	272	chr15	35143322	0	13H48M	*	0	0	*	*	NM:i:3	MD:Z:37G0A2T6	AS:i:37
chr1|121462469|121462529|+|2240.83	0	chr1	121462469	48	61M	*	0	0	AATCTATTTATTTATTTTTCTTCAGTGTTACAATGAAACAACATTGCTTTATTTAAATTTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	MD:Z:61	AS:i:61	XS:i:46
chr1|205386423|205386483|+|3446.41	272	KI270750.1	53599	0	7H47M7H	*	0	0	*	*	NM:i:3	MD:Z:27C0A6A11	AS:i:32
chr1|221508699|221508759|+|3731.6	272	chrX	69857918	0	37M24H	*	0	0	*	*	NM:i:0	MD:Z:37	AS:i:37
chr1|246607871|246607931|+|4121.10	256	chr19	29557507	0	19H42M	*	0	0	*	*	NM:i:0	MD:Z:42	AS:i:42
chr10|14878128|14878188|-|4488.16	256	chr15	84959120	0	17H44M	*	0	0	*	*	NM:i:0	MD:Z:44	AS:i:44
chr10|65751058|65751118|+|5083.9	272	chr2	222774610	0	18H43M	*	0	0	*	*	NM:i:1	MD:Z:6G36	AS:i:38
chr10|106187699|106187759|+|5744.7	272	chr10	73831305	0	61M	*	0	0	*	*	NM:i:5	MD:Z:0T44C4T3A5C0	AS:i:44
chr10|125698897|125698957|+|5980.5	0	chr10	125698897	60	61M	*	0	0	AGGTGGGCTCCATTTGGCCTCCTTCCTTGGTCCATTCTCATCTTCCTGGGCCCTGCGGATG	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	MD:Z:61	AS:i:61	XS:i:0
In [158]:
bam_sample("./data/sample.txt", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)
/home/aleg/Programming/pycltools/pycltools/pycltools.py:1990: UserWarning: Invalid input file format (.bam/.sam/.cram)
  warnings.warn ("Invalid input file format (.bam/.sam/.cram)")
In [159]:
bam_sample("./data/sample.sam", fp_out="./data/sample_100.txt", n_reads=100, verbose=True)
/home/aleg/Programming/pycltools/pycltools/pycltools.py:1999: UserWarning: Invalid output file format (.bam/.sam/.cram)
  warnings.warn ("Invalid output file format (.bam/.sam/.cram)")

DNA SEQUENCE TOOLS

base_generator

In [205]:
jhelp(base_generator, full = True)

base_generator (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], **kwargs)

Generator returning DNA/RNA bases according to a probability weightning
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).

In [161]:
bg = base_generator()
for i in range(10):
    print (next(bg))
A
T
C
A
T
C
G
G
T
T
In [162]:
bg = base_generator(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1])
for i in range(10):
    print (next(bg))
G
T
A
G
G
G
G
A
T
G

make_sequence

In [206]:
jhelp(make_sequence, full=True)

make_sequence (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], length=1000, **kwargs)

return a sequence of DNA/RNA bases according to a probability weightning
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed in the sequence
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).
* length: int (default 1000)
length of the sequence to be returned

In [164]:
make_sequence()
Out[164]:
'ACTGGCGTCGGATCGTGAGGTACTGATATTTCCGGCTCGCTGCCTATACCTATCAGTCCAAGTATGATGACTAGGAAGAACGCTAGTAATAGTGGGCGTTCACGGTTGAGAACCTCTTATTCATGGAAATAAATATTGAGTCTTGTGGGTCTGATAAGCGTTCCCCAAGTAAGTACGAAAAATCTGAGAGCCAAAGGAACTACCGTTATGAGGATCTCTGTTTAAATTCTGATAATATGTATTTGGATCCGAAATACGCGGTGATGGTGTGTAGTTACCTTAGGCTGATCGGTAAGCACTGCATCTACAGTTATAGTCCCCACTTTTCGTTTGCAAGCAAAAGTTGATCTATGTCACCCTCAATCTCGTAAAGGTGTTGCTATGGTTAAAGTAAGTGTCTCCTAGTGCTGATCAGAGCAAACGCTAAGGGAAAGGGGAGCTAAGCCCTTATGATCAAAGAGACAGATGGCTTAGCGCCCAATTCAGCTATTATGTGAAATACATGTACGGGAAAAATTCTTCACTTGGAAGAAACAATGGTGAGTCTTTATCCAGGAACATGTAAGGAATTTGTAGTTCCAAATTCGGTCTATGTCCAATGATGACAGAAGCTAACGTATTGCGTTATGAATCAGGTGTACTTGTGTTTGATTTTAGTAATCCTTCGACTGAATTTGCATCTGTGGACGAGATATCACGGAGATTTGGGTGTCTCTACTTGAACATCATAGTTTGTCATAGGGCTAGTTCTTGGCATTTAATAAAATTAATAATATTGACTAATAACAACGCGACTGTTCGTCGCTAAATTGAAAACCATACAATGATCTATTTCAATACCTATTTGTCCCCACAGTAATCGATTTGCTTTATTTATAAGAGAAGATTATCAATATTTTAAGTTCTATGAATTCCTAGCACTCATAGGTCTGTGTCCCGGTGTTCCAATCTGGTGTCAACGTCGATCAGCCTTTGTCTAGTTCTTAATCTAGAGTTTAGT'
In [165]:
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[], length=100)
Out[165]:
'TATNGGATTNANGGCGTNGAATGNATNANCGTTGNNCCAAATTGANCGNTGTNNTTNGATNNTNAGGCTTGCCCTCNCGCAAAACCNGNCAACTTNNNNG'
In [166]:
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1], length=100)
Out[166]:
'ATCATGATCGNTTTTAATCAAAATTATCTTAATAAATTAATTTCTATTTTANGNAANAGATATCTNTCTTCCTNATACNCAATATAAGTTAAAACTAGGG'