diff --git a/scripts/Pre-processing.ipynb b/scripts/Pre-processing.ipynb index 4556c74ce2115a19ab680dfe7e22a4b9c4bdd9dc..9ff43f2c57c96a7e5e1a2ea79bfe552982cb0390 100644 --- a/scripts/Pre-processing.ipynb +++ b/scripts/Pre-processing.ipynb @@ -30,13 +30,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──\n", + "── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──\n", "✔ dplyr 1.1.4 ✔ readr 2.1.5\n", "✔ forcats 1.0.0 ✔ stringr 1.5.1\n", "✔ ggplot2 3.4.4 ✔ tibble 3.2.1\n", "✔ lubridate 1.9.3 ✔ tidyr 1.3.0\n", "✔ purrr 1.0.2 \n", - "── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──\n", + "── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──\n", "✖ dplyr::filter() masks stats::filter()\n", "✖ dplyr::lag() masks stats::lag()\n", "ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors\n", @@ -400,26 +400,48 @@ "metadata": {}, "outputs": [ { - "ename": "ERROR", - "evalue": "Error in `mutate()`:\nℹ In argument: `pc_valid = is_valid/total_read`.\nCaused by error:\n! object 'is_valid' not found\n", - "output_type": "error", - "traceback": [ - "Error in `mutate()`:\nℹ In argument: `pc_valid = is_valid/total_read`.\nCaused by error:\n! object 'is_valid' not found\nTraceback:\n", - "1. EMOTE_parse_read(fastq_file = \"../data/pre-processing_examples/Example_1.fastq.gz\", \n . 
features = rf, force = T)", - "2. mutate(stat_tb, pc_valid = is_valid/total_read) %>% mutate(demux_filename = paste0(out_dir, \n . \"/\", fq_basename, \"_\", tolower(group), \".fastq.gz\"))", - "3. mutate(., demux_filename = paste0(out_dir, \"/\", fq_basename, \n . \"_\", tolower(group), \".fastq.gz\"))", - "4. mutate(stat_tb, pc_valid = is_valid/total_read)", - "5. mutate.data.frame(stat_tb, pc_valid = is_valid/total_read)", - "6. mutate_cols(.data, dplyr_quosures(...), by)", - "7. withCallingHandlers(for (i in seq_along(dots)) {\n . poke_error_context(dots, i, mask = mask)\n . context_poke(\"column\", old_current_column)\n . new_columns <- mutate_col(dots[[i]], data, mask, new_columns)\n . }, error = dplyr_error_handler(dots = dots, mask = mask, bullets = mutate_bullets, \n . error_call = error_call, error_class = \"dplyr:::mutate_error\"), \n . warning = dplyr_warning_handler(state = warnings_state, mask = mask, \n . error_call = error_call))", - "8. mutate_col(dots[[i]], data, mask, new_columns)", - "9. mask$eval_all_mutate(quo)", - "10. eval()", - "11. .handleSimpleError(function (cnd) \n . {\n . local_error_context(dots, i = frame[[i_sym]], mask = mask)\n . if (inherits(cnd, \"dplyr:::internal_error\")) {\n . parent <- error_cnd(message = bullets(cnd))\n . }\n . else {\n . parent <- cnd\n . }\n . message <- c(cnd_bullet_header(action), i = if (has_active_group_context(mask)) cnd_bullet_cur_group_label())\n . abort(message, class = error_class, parent = parent, call = error_call)\n . }, \"object 'is_valid' not found\", base::quote(NULL))", - "12. h(simpleError(msg, call))", - "13. abort(message, class = error_class, parent = parent, call = error_call)", - "14. 
signal_abort(cnd, .file)" - ] + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A data.frame: 2 × 5</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>group</th><th scope=col>is_valid_readseq</th><th scope=col>total_read</th><th scope=col>pc_valid</th><th scope=col>demux_filename</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><int></th><th scope=col><int></th><th scope=col><dbl></th><th scope=col><chr></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>INVALID</td><td> 0</td><td> 17</td><td>0.0000</td><td>../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz</td></tr>\n", + "\t<tr><td>VALID </td><td>9983</td><td>9983</td><td>0.9983</td><td>../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz </td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A data.frame: 2 × 5\n", + "\\begin{tabular}{lllll}\n", + " group & is\\_valid\\_readseq & total\\_read & pc\\_valid & demux\\_filename\\\\\n", + " <chr> & <int> & <int> & <dbl> & <chr>\\\\\n", + "\\hline\n", + "\t INVALID & 0 & 17 & 0.0000 & ../data/pre-processing\\_examples/Example\\_1\\_demux/Example\\_1\\_invalid.fastq.gz\\\\\n", + "\t VALID & 9983 & 9983 & 0.9983 & ../data/pre-processing\\_examples/Example\\_1\\_demux/Example\\_1\\_valid.fastq.gz \\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 2 × 5\n", + "\n", + "| group <chr> | is_valid_readseq <int> | total_read <int> | pc_valid <dbl> | demux_filename <chr> |\n", + "|---|---|---|---|---|\n", + "| INVALID | 0 | 17 | 0.0000 | ../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz |\n", + "| VALID | 9983 | 9983 | 0.9983 | ../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz |\n", + "\n" + ], + "text/plain": [ + " group is_valid_readseq total_read pc_valid demux_filename \n", + "1 INVALID 0 17 0.0000 ../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz\n", + "2 VALID 9983 
9983 0.9983 ../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz " + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -431,12 +453,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "9bb4d784-8ae4-49a8-b648-8972a19f1e12", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "DNAStringSet object of length 9983:\n", + " width seq\n", + " [1] 19 TTCAGCAGGAATGCCGAGA\n", + " [2] 19 CAGCAGGAATGCCGAGACC\n", + " [3] 19 GAAGTAGAATCAGTAATTT\n", + " [4] 19 CATGCGAGAGCGGTATTAT\n", + " [5] 19 TTAAGGGCGCACGGTGGAT\n", + " ... ... ...\n", + "[9979] 19 GCACAAAAAATGACCAAGA\n", + "[9980] 19 CCGCTCTTCCGATCTTAAG\n", + "[9981] 19 TATTGATATGCAAGATGAA\n", + "[9982] 19 TCAGTCAAGCTGATTTAAA\n", + "[9983] 19 TAAAAGCTACGCACGTTTT" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")\n", + "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz\")\n", "sr <- yield(fq_streamer)\n", "sr@sread" ] @@ -468,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "b13d7122-84f3-4ca4-a17f-0c6cb79ca173", "metadata": {}, "outputs": [], @@ -479,10 +523,59 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "a54f7041-4a64-498e-8a0b-112aa5806b68", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A EMOTE_features: 3 × 7</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>name</th><th scope=col>start</th><th scope=col>width</th><th scope=col>pattern_type</th><th scope=col>pattern</th><th scope=col>max_mismatch</th><th scope=col>readid_prepend</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th 
scope=col><list></th><th scope=col><dbl></th><th scope=col><lgl></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>readseq </td><td>14</td><td>19</td><td>1</td><td>ACTG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>Recognition.seq</td><td> 1</td><td> 3</td><td>2</td><td> AGG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>Control.seq </td><td>11</td><td> 3</td><td>2</td><td> CGC</td><td>0</td><td>FALSE</td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A EMOTE\\_features: 3 × 7\n", + "\\begin{tabular}{lllllll}\n", + " name & start & width & pattern\\_type & pattern & max\\_mismatch & readid\\_prepend\\\\\n", + " <chr> & <dbl> & <dbl> & <dbl> & <list> & <dbl> & <lgl>\\\\\n", + "\\hline\n", + "\t readseq & 14 & 19 & 1 & ACTG & 0 & FALSE\\\\\n", + "\t Recognition.seq & 1 & 3 & 2 & AGG & 0 & FALSE\\\\\n", + "\t Control.seq & 11 & 3 & 2 & CGC & 0 & FALSE\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A EMOTE_features: 3 × 7\n", + "\n", + "| name <chr> | start <dbl> | width <dbl> | pattern_type <dbl> | pattern <list> | max_mismatch <dbl> | readid_prepend <lgl> |\n", + "|---|---|---|---|---|---|---|\n", + "| readseq | 14 | 19 | 1 | ACTG | 0 | FALSE |\n", + "| Recognition.seq | 1 | 3 | 2 | AGG | 0 | FALSE |\n", + "| Control.seq | 11 | 3 | 2 | CGC | 0 | FALSE |\n", + "\n" + ], + "text/plain": [ + " name start width pattern_type pattern max_mismatch readid_prepend\n", + "1 readseq 14 19 1 ACTG 0 FALSE \n", + "2 Recognition.seq 1 3 2 AGG 0 FALSE \n", + "3 Control.seq 11 3 2 CGC 0 FALSE " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "rf" ] @@ -510,10 +603,63 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "6000958d-e2c8-437a-896a-7ddf9bcd3bf9", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A EMOTE_features: 4 × 7</caption>\n", + "<thead>\n", + "\t<tr><th 
scope=col>name</th><th scope=col>start</th><th scope=col>width</th><th scope=col>pattern_type</th><th scope=col>pattern</th><th scope=col>max_mismatch</th><th scope=col>readid_prepend</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><dbl></th><th scope=col><list></th><th scope=col><dbl></th><th scope=col><lgl></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>readseq </td><td>14</td><td>19</td><td>1</td><td>ACTG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>Recognition.seq</td><td> 1</td><td> 3</td><td>2</td><td> AGG</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>Control.seq </td><td>11</td><td> 3</td><td>2</td><td> CGC</td><td>0</td><td>FALSE</td></tr>\n", + "\t<tr><td>UMI </td><td> 4</td><td> 7</td><td>1</td><td> ACG</td><td>0</td><td> TRUE</td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A EMOTE\\_features: 4 × 7\n", + "\\begin{tabular}{lllllll}\n", + " name & start & width & pattern\\_type & pattern & max\\_mismatch & readid\\_prepend\\\\\n", + " <chr> & <dbl> & <dbl> & <dbl> & <list> & <dbl> & <lgl>\\\\\n", + "\\hline\n", + "\t readseq & 14 & 19 & 1 & ACTG & 0 & FALSE\\\\\n", + "\t Recognition.seq & 1 & 3 & 2 & AGG & 0 & FALSE\\\\\n", + "\t Control.seq & 11 & 3 & 2 & CGC & 0 & FALSE\\\\\n", + "\t UMI & 4 & 7 & 1 & ACG & 0 & TRUE\\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A EMOTE_features: 4 × 7\n", + "\n", + "| name <chr> | start <dbl> | width <dbl> | pattern_type <dbl> | pattern <list> | max_mismatch <dbl> | readid_prepend <lgl> |\n", + "|---|---|---|---|---|---|---|\n", + "| readseq | 14 | 19 | 1 | ACTG | 0 | FALSE |\n", + "| Recognition.seq | 1 | 3 | 2 | AGG | 0 | FALSE |\n", + "| Control.seq | 11 | 3 | 2 | CGC | 0 | FALSE |\n", + "| UMI | 4 | 7 | 1 | ACG | 0 | TRUE |\n", + "\n" + ], + "text/plain": [ + " name start width pattern_type pattern max_mismatch readid_prepend\n", + "1 readseq 14 19 1 ACTG 0 FALSE \n", + "2 Recognition.seq 1 3 
2 AGG 0 FALSE \n", + "3 Control.seq 11 3 2 CGC 0 FALSE \n", + "4 UMI 4 7 1 ACG 0 TRUE " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "rf = EMOTE_add_read_feature(rf, name = \"UMI\", start = 4, width = 7, pattern = \"ACG\", pattern_type = 1, readid_prepend = T)\n", "\n", @@ -541,10 +687,55 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "3b8ccb4d-98ad-487f-b577-37a09749f5ea", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<table class=\"dataframe\">\n", + "<caption>A data.frame: 2 × 8</caption>\n", + "<thead>\n", + "\t<tr><th scope=col>group</th><th scope=col>is_valid_readseq</th><th scope=col>is_valid_Recognition.seq</th><th scope=col>is_valid_Control.seq</th><th scope=col>is_valid_UMI</th><th scope=col>total_read</th><th scope=col>pc_valid</th><th scope=col>demux_filename</th></tr>\n", + "\t<tr><th scope=col><chr></th><th scope=col><int></th><th scope=col><int></th><th scope=col><int></th><th scope=col><int></th><th scope=col><int></th><th scope=col><dbl></th><th scope=col><chr></th></tr>\n", + "</thead>\n", + "<tbody>\n", + "\t<tr><td>INVALID</td><td>5163</td><td>5127</td><td> 93</td><td>4355</td><td>5180</td><td>0.000</td><td>../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz</td></tr>\n", + "\t<tr><td>VALID </td><td>4820</td><td>4820</td><td>4820</td><td>4820</td><td>4820</td><td>0.482</td><td>../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz </td></tr>\n", + "</tbody>\n", + "</table>\n" + ], + "text/latex": [ + "A data.frame: 2 × 8\n", + "\\begin{tabular}{llllllll}\n", + " group & is\\_valid\\_readseq & is\\_valid\\_Recognition.seq & is\\_valid\\_Control.seq & is\\_valid\\_UMI & total\\_read & pc\\_valid & demux\\_filename\\\\\n", + " <chr> & <int> & <int> & <int> & <int> & <int> & <dbl> & <chr>\\\\\n", + "\\hline\n", + "\t INVALID & 5163 & 5127 & 93 & 4355 & 5180 & 0.000 & 
../data/pre-processing\\_examples/Example\\_1\\_demux/Example\\_1\\_invalid.fastq.gz\\\\\n", + "\t VALID & 4820 & 4820 & 4820 & 4820 & 4820 & 0.482 & ../data/pre-processing\\_examples/Example\\_1\\_demux/Example\\_1\\_valid.fastq.gz \\\\\n", + "\\end{tabular}\n" + ], + "text/markdown": [ + "\n", + "A data.frame: 2 × 8\n", + "\n", + "| group <chr> | is_valid_readseq <int> | is_valid_Recognition.seq <int> | is_valid_Control.seq <int> | is_valid_UMI <int> | total_read <int> | pc_valid <dbl> | demux_filename <chr> |\n", + "|---|---|---|---|---|---|---|---|\n", + "| INVALID | 5163 | 5127 | 93 | 4355 | 5180 | 0.000 | ../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz |\n", + "| VALID | 4820 | 4820 | 4820 | 4820 | 4820 | 0.482 | ../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz |\n", + "\n" + ], + "text/plain": [ + " group is_valid_readseq is_valid_Recognition.seq is_valid_Control.seq is_valid_UMI total_read pc_valid demux_filename \n", + "1 INVALID 5163 5127 93 4355 5180 0.000 ../data/pre-processing_examples/Example_1_demux/Example_1_invalid.fastq.gz\n", + "2 VALID 4820 4820 4820 4820 4820 0.482 ../data/pre-processing_examples/Example_1_demux/Example_1_valid.fastq.gz " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "report = EMOTE_parse_read(\n", " fastq_file = \"../data/pre-processing_examples/Example_1.fastq.gz\",\n", @@ -573,10 +764,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "a42ea938-09f8-49cb-946d-eaeb53de509a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning message in open.connection(con, \"rb\"):\n", + "“cannot open file '../data/pre-processing_examples/Example_1_valid.fastq.gz': No such file or directory”\n" + ] + }, + { + "ename": "ERROR", + "evalue": "Error in open.connection(con, \"rb\"): cannot open the connection\n", + "output_type": "error", + "traceback": [ 
+ "Error in open.connection(con, \"rb\"): cannot open the connection\nTraceback:\n", + "1. FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")", + "2. FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")", + "3. callGeneric(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", + "4. eval(call, parent.frame())", + "5. eval(call, parent.frame())", + "6. FastqStreamer(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", + "7. FastqStreamer(con, n = 1e+06, readerBlockSize = readerBlockSize, \n . verbose = verbose)", + "8. open(con, \"rb\")", + "9. open.connection(con, \"rb\")" + ] + } + ], "source": [ "fq_streamer = FastqStreamer(\"../data/pre-processing_examples/Example_1_valid.fastq.gz\")\n", "sr <- yield(fq_streamer)\n",