diff --git a/scripts/Pre-processing.ipynb b/scripts/Pre-processing.ipynb index 9ff43f2c57c96a7e5e1a2ea79bfe552982cb0390..6b1aacb508dcd95097f748f6845e8f0c67255d62 100644 --- a/scripts/Pre-processing.ipynb +++ b/scripts/Pre-processing.ipynb @@ -23,7 +23,10 @@ "execution_count": 1, "id": "23c68491-3667-48a6-b38c-2a6092a1874a", "metadata": { - "scrolled": true + "scrolled": true, + "vscode": { + "languageId": "r" + } }, "outputs": [ { @@ -274,7 +277,11 @@ "cell_type": "code", "execution_count": 2, "id": "e28fdd55-ad64-456f-80a6-155be69b6fbd", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -328,7 +335,11 @@ "cell_type": "code", "execution_count": 3, "id": "c1bf29b6-53ca-4861-ac8c-4937203e0bd0", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "rf = EMOTE_read_features(start = 14, width = 19)" @@ -346,7 +357,11 @@ "cell_type": "code", "execution_count": 4, "id": "c2c6d370-dd19-47d0-9bdc-7e168d1d8c34", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -397,7 +412,11 @@ "cell_type": "code", "execution_count": 5, "id": "84ce999b-a51f-4dc7-b518-4468fe0fdb85", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -455,7 +474,11 @@ "cell_type": "code", "execution_count": 6, "id": "9bb4d784-8ae4-49a8-b648-8972a19f1e12", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -514,7 +537,11 @@ "cell_type": "code", "execution_count": 7, "id": "b13d7122-84f3-4ca4-a17f-0c6cb79ca173", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "rf = EMOTE_add_read_feature(rf, name = \"Recognition.seq\", start = 1, width = 3, pattern = \"AGG\" , pattern_type = 2)\n", @@ -525,7 +552,11 @@ "cell_type": "code", "execution_count": 8, "id": "a54f7041-4a64-498e-8a0b-112aa5806b68", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -605,7 +636,11 @@ "cell_type": "code", "execution_count": 9, "id": "6000958d-e2c8-437a-896a-7ddf9bcd3bf9", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -689,7 +724,11 @@ "cell_type": "code", "execution_count": 10, "id": "3b8ccb4d-98ad-487f-b577-37a09749f5ea", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { "data": { @@ -766,36 +805,36 @@ "cell_type": "code", "execution_count": 11, "id": "a42ea938-09f8-49cb-946d-eaeb53de509a", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning message in open.connection(con, \"rb\"):\n", - "“cannot open file '../data/pre-processing_examples/Example_1_valid.fastq.gz': No such file or directoryâ€\n" - ] - }, - { - "ename": "ERROR", - "evalue": "Error in open.connection(con, \"rb\"): cannot open the connection\n", - "output_type": "error", - "traceback": [ - "Error in open.connection(con, \"rb\"): cannot open the connection\nTraceback:\n", - "1. FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")", - "2. FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")", - "3. callGeneric(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", - "4. eval(call, parent.frame())", - "5. eval(call, parent.frame())", - "6. FastqStreamer(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", - "7. FastqStreamer(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", - "8. open(con, \"rb\")", - "9. open.connection(con, \"rb\")" - ] + "data": { + "text/plain": [ + "DNAStringSet object of length 4820:\n", + " width seq\n", + " [1] 19 GAAGTAGAATCAGTAATTT\n", + " [2] 19 CATGCGAGAGCGGTATTAT\n", + " [3] 19 TTCTTCCTACTAAGAACAC\n", + " [4] 19 TCCTCCGCTTATTGATATG\n", + " [5] 19 AAACGTTCAGCATTAAGTA\n", + " ... ... ...\n", + "[4816] 19 ATGTAAGTTATTTAAATAA\n", + "[4817] 19 CAAAACGTTAAGCGAATAA\n", + "[4818] 19 GCACAAAAAATGACCAAGA\n", + "[4819] 19 TCAGTCAAGCTGATTTAAA\n", + "[4820] 19 TAAAAGCTACGCACGTTTT" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")\n", + "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", "sr@sread" ] @@ -810,10 +849,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "a8ae5dcd-3eef-4c81-8d09-2520d9c0b8ad", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "BStringSet object of length 4820:\n", + " width seq\n", + " [1] 73 CCAGCGA:SRR2017586.3.1 HWI-ST865:389:HAUHUADXX:1:1101:1718:2167 length=50\n", + " [2] 73 CAAGGAA:SRR2017586.4.1 HWI-ST865:389:HAUHUADXX:1:1101:1598:2170 length=50\n", + " [3] 73 AGCGAGA:SRR2017586.7.1 HWI-ST865:389:HAUHUADXX:1:1101:1926:2194 length=50\n", + " [4] 73 AGGCGAA:SRR2017586.8.1 HWI-ST865:389:HAUHUADXX:1:1101:2348:2120 length=50\n", + " [5] 73 ACAGGAG:SRR2017586.9.1 HWI-ST865:389:HAUHUADXX:1:1101:2374:2223 length=50\n", + " ... ... ...\n", + "[4816] 77 AACAGCA:SRR2017586.9991.1 HWI-ST865:389:HAUHUADXX:1:1101:4623:12299 length=50\n", + "[4817] 77 GAAGAAA:SRR2017586.9992.1 HWI-ST865:389:HAUHUADXX:1:1101:4633:12362 length=50\n", + "[4818] 77 GCAAAAA:SRR2017586.9996.1 HWI-ST865:389:HAUHUADXX:1:1101:4779:12322 length=50\n", + "[4819] 77 CGAGGCA:SRR2017586.9999.1 HWI-ST865:389:HAUHUADXX:1:1101:5003:12298 length=50\n", + "[4820] 78 GGGCGCG:SRR2017586.10000.1 HWI-ST865:389:HAUHUADXX:1:1101:5227:12298 length=50" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "sr@id" ] @@ -845,10 +910,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "ccf94585-eff7-4fc6-b781-f4e461309d75", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DNAStringSet object of length 10000:\n", + " width seq\n", + " [1] 24 CAAGTTCAGCAGGAATGCCGAGAC\n", + " [2] 24 CAAGCAGCAGGAATGCCGAGACCG\n", + " [3] 24 CAAGGAAGTAGAATCAGTAATTTG\n", + " [4] 24 CAAGCATGCGAGAGCGGTATTATC\n", + " [5] 24 CAAGTTAAGGGCGCACGGTGGATG\n", + " ... ... ...\n", + " [9996] 24 TCGGTTAAGTTATTAAGGGCGCAC\n", + " [9997] 24 TCGGTAGGATGTTGGCTTAGAAGC\n", + " [9998] 24 TCGGTTAAGTTATTAAGGGCGCAC\n", + " [9999] 24 TCGGCAGCAGGAATGCCGAGACCG\n", + "[10000] 24 TCGGTTCAGCAGGAATGCCGAGAC" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_2.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", @@ -867,10 +958,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "d644a577-2a85-4470-b7a2-1a16b200bd37", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A EMOTE_features: 2 × 7</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>name</th><th scope=col>start</th><th scope=col>width</th><th scope=col>pattern_type</th><th scope=col>pattern</th><th scope=col>max_mismatch</th><th scope=col>readid_prepend</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><list></th><th scope=col><dbl></th><th scope=col><lgl></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>readseq</td><td>5</td><td>20</td><td>1</td><td> ACTG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>barcode</td><td>1</td><td> 4</td><td>2</td><td>TCGG, CAAG</td><td>0</td><td>FALSE</td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A EMOTE\\_features: 2 × 7\n", + "\\begin{tabular}{lllllll}\n", + " name & start & width & pattern\\_type & pattern & max\\_mismatch & readid\\_prepend\\\\\n", + " <chr> & <dbl> & <dbl> & <dbl> & <list> & <dbl> & <lgl>\\\\\n", + "\\hline\n", + "\t readseq & 5 & 20 & 1 & ACTG & 0 & FALSE\\\\\n", + "\t barcode & 1 & 4 & 2 & TCGG, CAAG & 0 & FALSE\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A EMOTE_features: 2 × 7\n", + "\n", + "| name <chr> | start <dbl> | width <dbl> | pattern_type <dbl> | pattern <list> | max_mismatch <dbl> | readid_prepend <lgl> |\n", + "|---|---|---|---|---|---|---|\n", + "| readseq | 5 | 20 | 1 | ACTG | 0 | FALSE |\n", + "| barcode | 1 | 4 | 2 | TCGG, CAAG | 0 | FALSE |\n", + "\n" + ], + "text/plain": [ + " name start width pattern_type pattern max_mismatch readid_prepend\n", + "1 readseq 5 20 1 ACTG 0 FALSE \n", + "2 barcode 1 4 2 TCGG, CAAG 0 FALSE " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "rf = EMOTE_read_features(start = 5, width = 20)\n", "rf = EMOTE_add_read_feature(rf, name = \"barcode\", start = 1, width = 4, pattern = c(\"TCGG\",\"CAAG\") , pattern_type = 2, readid_prepend = F)\n", @@ -899,10 +1039,71 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "0d9359ed-ae1c-4e2d-b005-eaf269b446b8", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Demux report already exists set force=T to overwrite: ../data/pre-processing_examples/Example_2_demux/Example_2_parse_report.csv\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A spec_tbl_df: 3 × 6</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>barcode_group</th><th scope=col>is_valid_readseq</th><th scope=col>is_valid_barcode</th><th scope=col>is_valid</th><th scope=col>pc_valid</th><th scope=col>demux_filename</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><chr></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>CAAG </td><td>4960</td><td>4977</td><td>4960</td><td>0.9965843</td><td>../data/pre-processing_examples/Example_2_demux/Example_2_CAAG_valid.fastq.gz</td></tr>\n", + "\t<tr><td>INVALID</td><td> 27</td><td> 0</td><td> 0</td><td>0.0000000</td><td>../data/pre-processing_examples/Example_2_demux/Example_2_invalid.fastq.gz </td></tr>\n", + "\t<tr><td>TCGG </td><td>4984</td><td>4996</td><td>4984</td><td>0.9975981</td><td>../data/pre-processing_examples/Example_2_demux/Example_2_TCGG_valid.fastq.gz</td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A spec\\_tbl\\_df: 3 × 6\n", + "\\begin{tabular}{llllll}\n", + " barcode\\_group & is\\_valid\\_readseq & is\\_valid\\_barcode & is\\_valid & pc\\_valid & demux\\_filename\\\\\n", + " <chr> & <dbl> & <dbl> & <dbl> & <dbl> & <chr>\\\\\n", + "\\hline\n", + "\t CAAG & 4960 & 4977 & 4960 & 0.9965843 & ../data/pre-processing\\_examples/Example\\_2\\_demux/Example\\_2\\_CAAG\\_valid.fastq.gz\\\\\n", + "\t INVALID & 27 & 0 & 0 & 0.0000000 & ../data/pre-processing\\_examples/Example\\_2\\_demux/Example\\_2\\_invalid.fastq.gz \\\\\n", + "\t TCGG & 4984 & 4996 & 4984 & 0.9975981 & ../data/pre-processing\\_examples/Example\\_2\\_demux/Example\\_2\\_TCGG\\_valid.fastq.gz\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A spec_tbl_df: 3 × 6\n", + "\n", + "| barcode_group <chr> | is_valid_readseq <dbl> | is_valid_barcode <dbl> | is_valid <dbl> | pc_valid <dbl> | demux_filename <chr> |\n", + "|---|---|---|---|---|---|\n", + "| CAAG | 4960 | 4977 | 4960 | 0.9965843 | ../data/pre-processing_examples/Example_2_demux/Example_2_CAAG_valid.fastq.gz |\n", + "| INVALID | 27 | 0 | 0 | 0.0000000 | ../data/pre-processing_examples/Example_2_demux/Example_2_invalid.fastq.gz |\n", + "| TCGG | 4984 | 4996 | 4984 | 0.9975981 | ../data/pre-processing_examples/Example_2_demux/Example_2_TCGG_valid.fastq.gz |\n", + "\n" + ], + "text/plain": [ + " barcode_group is_valid_readseq is_valid_barcode is_valid pc_valid demux_filename \n", + "1 CAAG 4960 4977 4960 0.9965843 ../data/pre-processing_examples/Example_2_demux/Example_2_CAAG_valid.fastq.gz\n", + "2 INVALID 27 0 0 0.0000000 ../data/pre-processing_examples/Example_2_demux/Example_2_invalid.fastq.gz \n", + "3 TCGG 4984 4996 4984 0.9975981 ../data/pre-processing_examples/Example_2_demux/Example_2_TCGG_valid.fastq.gz" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "EMOTE_demultiplex(\n", " fastq_file = \"../data/pre-processing_examples/Example_2.fastq.gz\",\n", @@ -931,10 +1132,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "cb6ab678-8bb7-4ef4-9168-1a6988bcc1f8", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DNAStringSet object of length 4984:\n", + " width seq\n", + " [1] 20 AGAAAAGCCAGATTTAATTA\n", + " [2] 20 TTTAAAGAATTAGATCAAAA\n", + " [3] 20 TGAATGACAATATGTCAACG\n", + " [4] 20 TTAAGTTATTAAGGGCGCAC\n", + " [5] 20 TAAGTTATTAAGGGCGCACG\n", + " ... ... ...\n", + "[4980] 20 TTAAGTTATTAAGGGCGCAC\n", + "[4981] 20 TAGGATGTTGGCTTAGAAGC\n", + "[4982] 20 TTAAGTTATTAAGGGCGCAC\n", + "[4983] 20 CAGCAGGAATGCCGAGACCG\n", + "[4984] 20 TTCAGCAGGAATGCCGAGAC" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_2_demux/Example_2_TCGG_valid.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", @@ -959,10 +1186,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "6c933c7e-0124-48d9-a4e9-11a9778da695", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DNAStringSet object of length 10000:\n", + " width seq\n", + " [1] 50 TCAGCCAGTGCACGCAAAAAAAAAAAAAAAAAAAAAAAAAAAATCGGAAG\n", + " [2] 50 TGGGGGTAGCGAGCTTGTGGGAAGCGGTCTTTGGCGATGTGGGGGTTAAA\n", + " [3] 50 TTCGAATCCTACATCTGGAGCCAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", + " [4] 50 ATTATCCAGTCCATGTCTTTGATTATTGCCATGATGTATATTGGGGCTAA\n", + " [5] 50 AATTCTTGGTGTAGGGGTAAAATCCGTAGAGATCAAGAGGAAAAAAAAAA\n", + " ... ... ...\n", + " [9996] 50 CACCGCCCGTCACACCATGGGAGTTGTGTTTGCCTTAAGTCAGGATGCTA\n", + " [9997] 50 CATTAAAGGCGAAGGAGTCTTATACAAAAAAAAAAAAAAAAAAAAAAAAA\n", + " [9998] 50 ATTGGAGTGAAGGCAAATCCACCTCTGTATTTGAAAAAAAAAAAAAAAAA\n", + " [9999] 50 AGCACCGCTATAGGAACTCAACCTATGGTTCACCTTTGCATCAGCATTGA\n", + "[10000] 50 CACACCATGGGAGTTGTGTTTGCCCCAAAAAAAAAAAAAAAAAAAAAAAA" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_3.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", @@ -980,9 +1233,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "bd2100af-f92d-4cb6-8199-6c4b945bb434", - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "r" + } + }, "outputs": [], "source": [ "rf = EMOTE_read_features(start = 1, width = 50)" @@ -1007,10 +1264,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "233819ea-b139-4950-a735-8800952aa769", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A EMOTE_features: 2 × 7</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>name</th><th scope=col>start</th><th scope=col>width</th><th scope=col>pattern_type</th><th scope=col>pattern</th><th scope=col>max_mismatch</th><th scope=col>readid_prepend</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><list></th><th scope=col><dbl></th><th scope=col><lgl></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>readseq</td><td>1</td><td>50</td><td>1</td><td> ACTG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>PolyA </td><td>1</td><td>50</td><td>3</td><td>AAAAAA.+</td><td>0</td><td>FALSE</td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A EMOTE\\_features: 2 × 7\n", + "\\begin{tabular}{lllllll}\n", + " name & start & width & pattern\\_type & pattern & max\\_mismatch & readid\\_prepend\\\\\n", + " <chr> & <dbl> & <dbl> & <dbl> & <list> & <dbl> & <lgl>\\\\\n", + "\\hline\n", + "\t readseq & 1 & 50 & 1 & ACTG & 0 & FALSE\\\\\n", + "\t PolyA & 1 & 50 & 3 & AAAAAA.+ & 0 & FALSE\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A EMOTE_features: 2 × 7\n", + "\n", + "| name <chr> | start <dbl> | width <dbl> | pattern_type <dbl> | pattern <list> | max_mismatch <dbl> | readid_prepend <lgl> |\n", + "|---|---|---|---|---|---|---|\n", + "| readseq | 1 | 50 | 1 | ACTG | 0 | FALSE |\n", + "| PolyA | 1 | 50 | 3 | AAAAAA.+ | 0 | FALSE |\n", + "\n" + ], + "text/plain": [ + " name start width pattern_type pattern max_mismatch readid_prepend\n", + "1 readseq 1 50 1 ACTG 0 FALSE \n", + "2 PolyA 1 50 3 AAAAAA.+ 0 FALSE " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "rf = EMOTE_add_read_feature(rf, name = \"PolyA\",start = 1,width = 50, pattern = \"AAAAAA.+\" , pattern_type = 3, readid_prepend = F)\n", "rf" @@ -1036,10 +1342,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "3377e5b5-be74-481c-bfa5-96e5b63d608e", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A data.frame: 2 × 5</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>group</th><th scope=col>is_valid_readseq</th><th scope=col>total_read</th><th scope=col>pc_valid</th><th scope=col>demux_filename</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><int></th><th scope=col><int></th><th scope=col><dbl></th><th scope=col><chr></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>INVALID</td><td> 0</td><td>1788</td><td>0.0000</td><td>../data/pre-processing_examples/Example_3_demux/Example_3_invalid.fastq.gz</td></tr>\n", + "\t<tr><td>VALID </td><td>8212</td><td>8212</td><td>0.8212</td><td>../data/pre-processing_examples/Example_3_demux/Example_3_valid.fastq.gz </td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A data.frame: 2 × 5\n", + "\\begin{tabular}{lllll}\n", + " group & is\\_valid\\_readseq & total\\_read & pc\\_valid & demux\\_filename\\\\\n", + " <chr> & <int> & <int> & <dbl> & <chr>\\\\\n", + "\\hline\n", + "\t INVALID & 0 & 1788 & 0.0000 & ../data/pre-processing\\_examples/Example\\_3\\_demux/Example\\_3\\_invalid.fastq.gz\\\\\n", + "\t VALID & 8212 & 8212 & 0.8212 & ../data/pre-processing\\_examples/Example\\_3\\_demux/Example\\_3\\_valid.fastq.gz \\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 2 × 5\n", + "\n", + "| group <chr> | is_valid_readseq <int> | total_read <int> | pc_valid <dbl> | demux_filename <chr> |\n", + "|---|---|---|---|---|\n", + "| INVALID | 0 | 1788 | 0.0000 | ../data/pre-processing_examples/Example_3_demux/Example_3_invalid.fastq.gz |\n", + "| VALID | 8212 | 8212 | 0.8212 | ../data/pre-processing_examples/Example_3_demux/Example_3_valid.fastq.gz |\n", + "\n" + ], + "text/plain": [ + " group is_valid_readseq total_read pc_valid demux_filename \n", + "1 INVALID 0 1788 0.0000 ../data/pre-processing_examples/Example_3_demux/Example_3_invalid.fastq.gz\n", + "2 VALID 8212 8212 0.8212 ../data/pre-processing_examples/Example_3_demux/Example_3_valid.fastq.gz " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "report = EMOTE_parse_read(\n", " fastq_file = \"../data/pre-processing_examples/Example_3.fastq.gz\",\n", @@ -1059,12 +1414,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "3ce07100-5c95-47fe-a7e3-d71729e854b3", - "metadata": {}, - "outputs": [], + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DNAStringSet object of length 8212:\n", + " width seq\n", + " [1] 50 TGGGGGTAGCGAGCTTGTGGGAAGCGGTCTTTGGCGATGTGGGGGTTAAA\n", + " [2] 22 TTCGAATCCTACATCTGGAGCC\n", + " [3] 50 ATTATCCAGTCCATGTCTTTGATTATTGCCATGATGTATATTGGGGCTAA\n", + " [4] 40 AATTCTTGGTGTAGGGGTAAAATCCGTAGAGATCAAGAGG\n", + " [5] 50 AATGGGGTGCACAAAGAGAAGCAATACTGCGAAGTGGAGCCAATCTTCAA\n", + " ... ... ...\n", + "[8208] 50 CACCGCCCGTCACACCATGGGAGTTGTGTTTGCCTTAAGTCAGGATGCTA\n", + "[8209] 25 CATTAAAGGCGAAGGAGTCTTATAC\n", + "[8210] 33 ATTGGAGTGAAGGCAAATCCACCTCTGTATTTG\n", + "[8211] 50 AGCACCGCTATAGGAACTCAACCTATGGTTCACCTTTGCATCAGCATTGA\n", + "[8212] 26 CACACCATGGGAGTTGTGTTTGCCCC" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_3_valid.fastq.gz\")\n", + "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_3_demux/Example_3_valid.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", "sr@sread" ]