From 97ad43f5d9e7cf576e7dace17856959626c323b7 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 15:13:28 +0200 Subject: [PATCH 01/35] fix agenda styling --- topics/epigenetics/tutorials/methylation-seq/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/epigenetics/tutorials/methylation-seq/tutorial.md b/topics/epigenetics/tutorials/methylation-seq/tutorial.md index c2609b3e..b0e87fd7 100644 --- a/topics/epigenetics/tutorials/methylation-seq/tutorial.md +++ b/topics/epigenetics/tutorials/methylation-seq/tutorial.md @@ -19,7 +19,7 @@ tutorial_name: methylation-seq > > The data we use in this tutorial is available at [Zenodo](https://zenodo.org/record/557099). > -> {: .agenda} +{: .agenda} # Load data and quality control From 038cea14fd027d0fcfd35766b9d83d60cec9b94b Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 15:46:07 +0200 Subject: [PATCH 02/35] fix links in metagenomics topi --- topics/metagenomics/tutorials/general-tutorial/tutorial.md | 4 ++-- topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/topics/metagenomics/tutorials/general-tutorial/tutorial.md b/topics/metagenomics/tutorials/general-tutorial/tutorial.md index c5ae5d3a..5e8df9e1 100644 --- a/topics/metagenomics/tutorials/general-tutorial/tutorial.md +++ b/topics/metagenomics/tutorials/general-tutorial/tutorial.md @@ -178,7 +178,7 @@ The first step in any analysis should be to check and improve the quality of our > ### :nut_and_bolt: Comment > -> For more information on the topic of quality control, please see our training materials [here](https://galaxyproject.github.io/training-material/NGS-QC/). +> For more information on the topic of quality control, please see our training materials [here](https://galaxyproject.github.io/training-material/topics/sequence-analysis/). 
{: .comment} @@ -466,7 +466,7 @@ To further explore the community structure, we can visualize it with dedicated t > ![](../../../../shared/images/viewatphinch.png) > > 2. Click on the icon -> +> > It will lead you to the Phinch website, which will automatically load in your file, and where you can several interactive visualisations: > > ![](../../../../shared/images/phinch_overviewpage.png) diff --git a/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.md b/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.md index a677471d..643f777b 100644 --- a/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.md +++ b/topics/metagenomics/tutorials/mothur-miseq-sop/tutorial.md @@ -221,7 +221,7 @@ Here the first column contains the read name, and the second column contains the ### Data Cleaning For more information on the topic of quality control, please see our training materials -[here](https://galaxyproject.github.io/training-material/NGS-QC/) +[here](https://galaxyproject.github.io/training-material/topics/sequence-analysis/) Next we want to improve the quality of our data. But first, let's get a feel of our data @@ -347,7 +347,7 @@ the number of duplicates of this sequence observed in each sample. ### Sequence Alignment For more information on the topic of alignment, please see our training materials -[here](https://galaxyproject.github.io/training-material/NGS-mapping/) +[here](https://galaxyproject.github.io/training-material/topics/sequence-analysis/) We are now ready to align our sequences to the reference. 
This step is an important step to perform to improve the clustering of your OTUs [[Schloss 2013]](https://doi.org/10.1038/ismej.2012.102) From 0de88a1a01e405baba821a450dacff44f1a1ca77 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 15:49:53 +0200 Subject: [PATCH 03/35] re-enable travis link checking --- .travis.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7572baa5..733b3765 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,8 @@ before_script: script: - set -e # Check links - #- find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 401,403,405,500 --whitelist http://localhost:8080 {}" + - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 401,403,405,500 --whitelist http://localhost:8080 {}" + # # Check structure #- ./bin/check_structure.py # Check docker @@ -38,11 +39,10 @@ script: # echo "" # done < DOCKER_BUILDS.list # Once all tutorials use new docker bootstrap script use this - - | - while read -r DIR - do - echo "$DIR" - docker build -f "$DIR/Dockerfile" . - echo "" - done < DOCKER_BUILDS.list - + #- | + # while read -r DIR + # do + # echo "$DIR" + # docker build -f "$DIR/Dockerfile" . + # echo "" + # done < DOCKER_BUILDS.list From 0b79d1b58d6b8bca2b7875108df2e68ba22f2eab Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 15:53:51 +0200 Subject: [PATCH 04/35] don't allow status codes 403 (access denied), 500 (internal server error), 401 (unauthorized access) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 733b3765..4397f7ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ before_script: script: - set -e # Check links - - find . 
\( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 401,403,405,500 --whitelist http://localhost:8080 {}" + - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist http://localhost:8080 {}" # # Check structure #- ./bin/check_structure.py From 30a5b62d4d836c24a8a0feb48443ac7bd50fc471 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:24:41 +0200 Subject: [PATCH 05/35] fix links in admin topic --- topics/admin/tutorials/advanced-galaxy-customisation/slides.html | 6 ++---- topics/admin/tutorials/database-schema/tutorial.md | 3 ++- topics/admin/tutorials/dev-to-production/tutorial.md | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/topics/admin/tutorials/advanced-galaxy-customisation/slides.html b/topics/admin/tutorials/advanced-galaxy-customisation/slides.html index b238d6bc..32997edb 100644 --- a/topics/admin/tutorials/advanced-galaxy-customisation/slides.html +++ b/topics/admin/tutorials/advanced-galaxy-customisation/slides.html @@ -228,7 +228,7 @@ ### Galaxy Webhooks -Have their own [slides](https://galaxyproject.github.io/training-material//topics/dev//tutorials/webhooks/slides.html#1) and [tutorial](https://galaxyproject.github.io/training-material//topics/dev//tutorials/webhooks/tutorial.md). +Have their own [slides](https://galaxyproject.github.io/training-material//topics/dev//tutorials/webhooks/slides.html#1) and [tutorial](https://galaxyproject.github.io/training-material//topics/dev//tutorials/webhooks/tutorial.html). And also a [documentation](https://docs.galaxyproject.org/en/master/admin/special_topics/webhooks.html) @@ -257,7 +257,7 @@ -- -* Embedding [twitter feeds](https://publish.twitter.com) is popular, e.g. [usegalaxy.org](https://usegalaxy.org) +* Embedding [twitter feeds](http://publish.twitter.com) is popular, e.g. 
[usegalaxy.org](https://usegalaxy.org) -- @@ -351,5 +351,3 @@ * Create a quota that adds 1GB * Create a quota that decreases by 1GB * Create a quota that grants unlimited - - diff --git a/topics/admin/tutorials/database-schema/tutorial.md b/topics/admin/tutorials/database-schema/tutorial.md index 0c24eae5..e2b4d1f1 100644 --- a/topics/admin/tutorials/database-schema/tutorial.md +++ b/topics/admin/tutorials/database-schema/tutorial.md @@ -57,7 +57,8 @@ What’s not in the database is the data. Datasets are stored outside the databa Entity-relationship diagrams are a way to understand tables and the relationships between them inside a relational database. SchemaSpy (http://schemaspy.sourceforge.net/) is a free (and remarkable tool) for generating ER diagrams. We’be used it generate a description of the database backing the server in this container. See - https://galaxyproject.org/schema/SchemaSpy/index.html + + https://galaxyproject.org /schema/SchemaSpy/index.html The “Tables” tab is a good place to start learning the structure of the database. Each table represents a different type of thing, and often that thing is itself a relationship. For example, each record in the dataset table has information about a specific dataset, while records in the history_dataset_association table have information about what histories that dataset is in. 
diff --git a/topics/admin/tutorials/dev-to-production/tutorial.md b/topics/admin/tutorials/dev-to-production/tutorial.md index 9d6e47f4..f91b24d7 100644 --- a/topics/admin/tutorials/dev-to-production/tutorial.md +++ b/topics/admin/tutorials/dev-to-production/tutorial.md @@ -20,7 +20,7 @@ Move from dev instance to production instance :heavy_check_mark: ***Requirements*** -- *[Galaxy Server Administration](http://galaxyproject.github.io/training-material/Admin-Corner/slides/index.html)* +- *[Galaxy Server Administration](http://galaxyproject.github.io/training-material/topics/admin/slides/)* :hourglass: ***Time estimation*** *TODO* From 6aea58b1864d928307da0b621993170593a38b1c Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:28:05 +0200 Subject: [PATCH 06/35] fix links in transcriptomics topic --- topics/transcriptomics/README.md | 4 ++-- topics/transcriptomics/tutorials/de-novo/tutorial.md | 8 ++++---- topics/transcriptomics/tutorials/srna/tutorial.md | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/topics/transcriptomics/README.md b/topics/transcriptomics/README.md index c1641edc..317a678d 100644 --- a/topics/transcriptomics/README.md +++ b/topics/transcriptomics/README.md @@ -7,13 +7,13 @@ RNA-sequencing is a method used to reveal the presence and quantity of RNA in a A deck of slides is available for this topic: -- [General introduction about RNA seq data analysis](http://galaxyproject.github.io/training-material/RNA-Seq/slides/) +- [General introduction about RNA seq data analysis](http://galaxyproject.github.io/training-material/topics/transcriptomics/slides/) # Tutorials A tutorial with hands-on is available for this topic: -- [Reference-based RNA-seq data analysis](http://galaxyproject.github.io/training-material//RNA-Seq/tutorials/ref_based) +- [Reference-based RNA-seq data analysis](http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html) ## Input datasets diff --git 
a/topics/transcriptomics/tutorials/de-novo/tutorial.md b/topics/transcriptomics/tutorials/de-novo/tutorial.md index 0286590e..ed9665b2 100644 --- a/topics/transcriptomics/tutorials/de-novo/tutorial.md +++ b/topics/transcriptomics/tutorials/de-novo/tutorial.md @@ -42,7 +42,7 @@ Due to the large size of this dataset, we have downsampled it to only include re > 9. Rename the files in your history to retain just the necessary information (*e.g.* "G1E R1 forward reads") > > > ### :bulb: Tip: Importing data via links -> > Data available from zenodo: [![DOI](https://zenodo.org/badge/DOI/10.123/GTNdenovoRNAseq.svg)](https://doi.org/10.123/GTNdenovoRNAseq) +> > Data available from zenodo: [![DOI](https://zenodo.org/badge/DOI/10.123/GTNdenovoRNAseq.svg)](https://zenodo.org/record/583140) > > > > Below are the links to the read files that can be copied and pasted in the upload manager. > > - https://zenodo.org/record/583140/files/G1E_rep1_forward_read_%28SRR549355_1%29 @@ -208,7 +208,7 @@ We now want to identify which transcripts are differentially expressed between t ## Count the number of reads per transcript -To compare the abundance of transcripts between different cellular states, the first essential step is to quantify the number of reads per transcript. [`FeatureCounts`](http://bioinf.wehi.edu.au/featureCounts/) is one of the most popular tools for counting reads in genomic features. In our case, we'll be using `FeatureCounts` to count reads aligning in exons of our `GFFCompare` generated transcriptome database. +To compare the abundance of transcripts between different cellular states, the first essential step is to quantify the number of reads per transcript. [`FeatureCounts`](http://bioinf.wehi.edu.au/featureCounts/ ) is one of the most popular tools for counting reads in genomic features. In our case, we'll be using `FeatureCounts` to count reads aligning in exons of our `GFFCompare` generated transcriptome database. 
The recommended mode is "union", which counts overlaps even if a read only shares parts of its sequence with a genomic feature and disregards reads that overlap more than one feature. @@ -261,7 +261,7 @@ The first output of `DESeq2` is a tabular file. The columns are: 4. Standard error estimate for the log2 fold change estimate 5. [Wald](https://data.princeton.edu/wws509/notes/c2s3.html) statistic 6. *p*-value for the statistical significance of this change -7. *p*-value adjusted for multiple testing with the Benjamini-Hochberg procedure which controls false discovery rate ([FDR](https://www.biostathandbook.com/multiplecomparisons.html)) +7. *p*-value adjusted for multiple testing with the Benjamini-Hochberg procedure which controls false discovery rate ([FDR](http://www.biostathandbook.com/multiplecomparisons.html)) > ### :pencil2: Hands-on: @@ -403,4 +403,4 @@ In this tutorial, we have analyzed real RNA sequencing data to extract useful in > # Workflow -> This analysis pipeline can be recreated using the workflow here: [https://tinyurl.com/GTNdenovoRNAseqWorkflow](https://tinyurl.com/GTNdenovoRNAseqWorkflow) +> This analysis pipeline can be recreated using the workflow here: [https://tinyurl.com/GTNdenovoRNAseqWorkflow ](https://tinyurl.com/GTNdenovoRNAseqWorkflow) diff --git a/topics/transcriptomics/tutorials/srna/tutorial.md b/topics/transcriptomics/tutorials/srna/tutorial.md index e69774ce..83c3d058 100644 --- a/topics/transcriptomics/tutorials/srna/tutorial.md +++ b/topics/transcriptomics/tutorials/srna/tutorial.md @@ -34,7 +34,9 @@ It is of note that this tutorial uses datasets that have been de-multiplexed so ## Data upload and organization -Due to the large size of the original sRNA-seq datasets, we have downsampled them to only inlcude a subset of usable reads. 
These datasets are avaialble at [`Zenodo`](https://zenodo.org/record/####), where you can find the FASTQ files corresponding to replicate sRNA-seq and mRNA-seq libraries and additiona annotation files for the *Drosophila melanogaster* genome version dm3. + + +Due to the large size of the original sRNA-seq datasets, we have downsampled them to only inlcude a subset of usable reads. These datasets are avaialble at [`Zenodo`](zenodo.org /record/####), where you can find the FASTQ files corresponding to replicate sRNA-seq and mRNA-seq libraries and additiona annotation files for the *Drosophila melanogaster* genome version dm3. > ### :pencil2: Hands-on: Data upload and organization > From 534fbf817c9705be45dc2d07529bda42362c3051 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:28:39 +0200 Subject: [PATCH 07/35] fix links in epigenetics topic --- topics/epigenetics/slides/index.html | 6 +++--- topics/epigenetics/tutorials/methylation-seq/tutorial.md | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/topics/epigenetics/slides/index.html b/topics/epigenetics/slides/index.html index 07c4271a..bb908fda 100644 --- a/topics/epigenetics/slides/index.html +++ b/topics/epigenetics/slides/index.html @@ -52,7 +52,7 @@ --- -# [Hands-on](http://galaxyproject.github.io/training-material/MethylC-Seq/tutorials/methylation-seq.html) +# [Hands-on](https://galaxyproject.github.io/training-material/topics/epigenetics/tutorials/methylation-seq/tutorial.html) ##Step number 1: load data and run FastQC --- @@ -78,7 +78,7 @@ --- -# [Hands-on](http://galaxyproject.github.io/training-material/MethylC-Seq/tutorials/methylation-seq.html) +# [Hands-on](https://galaxyproject.github.io/training-material/topics/epigenetics/tutorials/methylation-seq/tutorial.html) ##Step number 2: Alignment and methylation bias --- @@ -95,7 +95,7 @@ --- -# [Hands-on](http://galaxyproject.github.io/training-material/MethylC-Seq/tutorials/methylation-seq.html) +# 
[Hands-on](https://galaxyproject.github.io/training-material/topics/epigenetics/tutorials/methylation-seq/tutorial.html) ##Step number 3: Visualization and more --- diff --git a/topics/epigenetics/tutorials/methylation-seq/tutorial.md b/topics/epigenetics/tutorials/methylation-seq/tutorial.md index b0e87fd7..fd4ebd2b 100644 --- a/topics/epigenetics/tutorials/methylation-seq/tutorial.md +++ b/topics/epigenetics/tutorials/methylation-seq/tutorial.md @@ -15,7 +15,7 @@ tutorial_name: methylation-seq > 5. [Metilene](#metilene) > > -> This tutorial is based on [I-Hsuan Lin et al.: 'Hierarchical Clustering of Breast Cancer Methylomes Revealed Differentially Methylated and Expressed Breast Cancer Genes'](http://dx.doi.org/10.1371/journal.pone.0118453). +> This tutorial is based on [I-Hsuan Lin et al.: 'Hierarchical Clustering of Breast Cancer Methylomes Revealed Differentially Methylated and Expressed Breast Cancer Genes'](https://dx.doi.org/10.1371/journal.pone.0118453). > > The data we use in this tutorial is available at [Zenodo](https://zenodo.org/record/557099). > From e4e462010db3694375b8b2469a703530e670cac8 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:30:23 +0200 Subject: [PATCH 08/35] fix links in chip-seq topic --- topics/chip-seq/tutorials/chip-seq/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/chip-seq/tutorials/chip-seq/tutorial.md b/topics/chip-seq/tutorials/chip-seq/tutorial.md index 794cf01c..e4cf1b52 100644 --- a/topics/chip-seq/tutorials/chip-seq/tutorial.md +++ b/topics/chip-seq/tutorials/chip-seq/tutorial.md @@ -212,7 +212,7 @@ Use the regions provided by the gene annotation file downloaded from UCSC and yo **Leleu et al. 
(2010):** [Processing and analyzing ChIP-seq data](http://www.ncbi.nlm.nih.gov/pubmed/20861161), (doi: 10.1093/bfgp/elq022) - Fairly detailed review of key concepts of ChIP-seq data processing (less detailed on analysis) -**Peter Park (2009):** [ChIP-seq: Advantages and challenges of a maturing technology](http://www.nature.com/nrg/journal/v10/n10/full/nrg2641.html), (doi:10.1038/nrg2641) +**Peter Park (2009):** [ChIP-seq: Advantages and challenges of a maturing technology](https://www.ncbi.nlm.nih.gov/pubmed/19736561), (doi:10.1038/nrg2641) **Kharchenko et al. (2008):** [Design and analysis of ChIP-seq experiments for DNA-binding proteins](http://www.ncbi.nlm.nih.gov/pubmed/19029915), (doi:10.1038/nbt.1508) From 0f84cd6d8d5b45b1064b84b99093ea719b65ec91 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:33:29 +0200 Subject: [PATCH 09/35] fix links in introduction topic --- topics/introduction/README.md | 2 +- .../tutorials/options-for-using-galaxy/slides.html | 6 +-- .../processing-many-samples-at-once/tutorial.md | 54 +++++++++++----------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/topics/introduction/README.md b/topics/introduction/README.md index c6a18faf..4aac5be2 100644 --- a/topics/introduction/README.md +++ b/topics/introduction/README.md @@ -9,7 +9,7 @@ Here, you will find some material to learn how to use Galaxy. 
A deck of slides is available for this topic: -- [General introduction about Galaxy](http://galaxyproject.github.io/training-material/Introduction/slides/) +- [General introduction about Galaxy](http://galaxyproject.github.io/training-material/topics/introduction/slides/) # Tutorials diff --git a/topics/introduction/tutorials/options-for-using-galaxy/slides.html b/topics/introduction/tutorials/options-for-using-galaxy/slides.html index 06532c20..80795ff7 100644 --- a/topics/introduction/tutorials/options-for-using-galaxy/slides.html +++ b/topics/introduction/tutorials/options-for-using-galaxy/slides.html @@ -58,7 +58,7 @@ 1. Tool servers - Host specific tools to make them easy to access and run -Complete list: [bit.ly/gxyServers](https://bit.ly/gxyServers) +Complete list: [bit.ly/gxyServers ](https://bit.ly/gxyServers) --- @@ -73,7 +73,7 @@ | Norway | [Norwegian e-Infrastructure for Life Sciences (NeLS)](https://nels.bioinfo.no/) | | US | [Jetstream](https://galaxyproject.org/cloud/jetstream/) | -Complete list: [bit.ly/gxysemipublic](http://bit.ly/gxysemipublic) +Complete list: [bit.ly/gxysemipublic ](http://bit.ly/gxysemipublic) --- @@ -139,7 +139,7 @@ ### Commercial support -- You can buy a [preconfigured Galaxy server](https://bioteam.net/products/galaxy-appliance/) from [BioTeam](https://bioteam.net) +- You can buy a [preconfigured Galaxy server](http://bioteam.net/products/galaxy-appliance/) from [BioTeam](http://bioteam.net/) - [Globus Genomics](http://globusgenomics.org/) and [GenomeCloud](http://www.genome-cloud.com/) provide cloud-based Galaxy servers - [Commercially provided consulting and training](https://galaxyproject.org/support/commercial/) are also available diff --git a/topics/introduction/tutorials/processing-many-samples-at-once/tutorial.md b/topics/introduction/tutorials/processing-many-samples-at-once/tutorial.md index d477c800..486fd35b 100644 --- a/topics/introduction/tutorials/processing-many-samples-at-once/tutorial.md +++ 
b/topics/introduction/tutorials/processing-many-samples-at-once/tutorial.md @@ -34,49 +34,49 @@ These datasets represent genomic DNA (enriched for mitochondria via a long range If you imported [history]( https://test.galaxyproject.org/u/anton/h/collections-1) as described [above](https://github.com/nekrut/galaxy/wiki/Processing-many-samples-at-once#0-getting-data), your screen will look something like this: -![manyDatasets](https://galaxyproject.org/galaxy101/manyDatasets.png) +![manyDatasets](https://galaxyproject.org/tutorials/collections/manyDatasets.png) -Now click the checkbox in ![HistioryItemControls](https://galaxyproject.org/galaxy101/historyItemControls.png) and you will see your history changing like this: +Now click the checkbox in ![HistioryItemControls](https://galaxyproject.org/tutorials/collections/historyItemControls.png) and you will see your history changing like this: -![historyWithCheckboxes](https://galaxyproject.org/galaxy101/historyWithCheckboxes.png) +![historyWithCheckboxes](https://galaxyproject.org/tutorials/collections/historyWithCheckboxes.png) -Let's click ![all](https://galaxyproject.org/galaxy101/all.png), which will select all datasets in the history, then click ![allSelected](https://galaxyproject.org/galaxy101/forAllSelected.png) and finally select **Build List of Dataset Pairs** from the following menu: +Let's click `All`, which will select all datasets in the history, then click ![allSelected](https://galaxyproject.org/tutorials/collections/forAllSelected.png) and finally select **Build List of Dataset Pairs** from the following menu: -![buildPairs](https://galaxyproject.org/galaxy101/buildPairs.png) +![buildPairs](https://galaxyproject.org/tutorials/collections/buildPairs.png) The following wizard will appear: -![collectionCreation](https://galaxyproject.org/galaxy101/collectionCreation.png) +![collectionCreation](https://galaxyproject.org/tutorials/collections/collectionCreation.png) In this case Galaxy automatically assigned 
pairs using the `_1` and `_2` endings of dataset names. Let's however pretend that this did not happen. Click on **Unpair all** (highlighted in red in the figure above) link and then on **Clear** link (highlighted in blue in the figure above). The interface will change into its virgin state: -![collectionCreationClean](https://galaxyproject.org/galaxy101/collectionCreationClean.png) +![collectionCreationClean](https://galaxyproject.org/tutorials/collections/collectionCreationClean.png) Hopefully you remember that we have paired-end data in this scenario. Datasets containing the first (forward) and the second (reverse) read are differentiated by having `_1` and `_2` in the filename. We can use this feature in dataset collection wizard to pair our datasets. Type `_1` in the left **Filter this list** text box and `_2` in the right: -![1and2](https://galaxyproject.org/galaxy101/1and2.png) +![1and2](https://galaxyproject.org/tutorials/collections/1and2.png) You will see that the dataset collection wizard will automatically filter lists on each side of the interface: -![collectionPrefiltered](https://galaxyproject.org/galaxy101/collectionPrefiltered.png) +![collectionPrefiltered](https://galaxyproject.org/tutorials/collections/collectionPrefiltered.png) Now you can either click **Auto pair** if pairs look good to you (proper combinations of datasets are listed in each line) or pair each forward/reverse group individually by pressing **Pair these datasets** button separating each pair: -![collectionCreation](https://galaxyproject.org/galaxy101/collectionCreation.png) +![collectionCreation](https://galaxyproject.org/tutorials/collections/collectionCreation.png) Now it is time to name the collection: -![collectionNaming](https://galaxyproject.org/galaxy101/collectionNaming.png) +![collectionNaming](https://galaxyproject.org/tutorials/collections/collectionNaming.png) and create the collection by clicking **Create list**. 
A new item will appear in the history as you can see on the panel **A** below. Clicking on collection will expand it to show four pairs it contains (panel **B**). Clicking individual pairs will expand them further to reveal **forward** and **reverse** datasets (panel **C**). Expanding these further will enable one to see individual datasets (panel **D**). -![collection_ABCD](https://galaxyproject.org/galaxy101/collection_ABCD.png) +![collection_ABCD](https://galaxyproject.org/tutorials/collections/collection_ABCD.png) ## 2.0. Using collections By now we see that a collection can be used to bundle a large number of items into a single history item. This means that many Galaxy tools will be able to process all datasets in a collection transparently to you. Let's try to map these datasets to human genome using `bwa-mem` mapper: -![bwa_mem_collection_readGroups](https://galaxyproject.org/galaxy101/bwa_mem_collection_readGroups.png) +![bwa_mem_collection_readGroups](https://galaxyproject.org/tutorials/collections/bwa_mem_collection_readGroups.png) Here is what you need to do: @@ -88,15 +88,15 @@ Here is what you need to do: You will see jobs being submitted and new datasets appearing in the history. IN particular below you can see that Galaxy has started four jobs (two yellow and two gray). This is because we have eight paired datasets with each pair being processed separately by `bwa-mem`. 
As a result we have four `bwa-mem` runs: -![bwa_memCollectionRunning](https://galaxyproject.org/galaxy101/bwa_memCollectionRunning.png) +![bwa_memCollectionRunning](https://galaxyproject.org/tutorials/collections/bwa_memCollectionRunning.png) Once these jobs are finished they will disappear from the history and all results will be represented as a new collection: -![bwa_memCollectionDone](https://galaxyproject.org/galaxy101/bwa_memCollectionDone.png) +![bwa_memCollectionDone](https://galaxyproject.org/tutorials/collections/bwa_memCollectionDone.png) Let's look at this collection by clicking on it (panel **A** in the figure below). You can see that now this collection is no longer paired (compared to the collection we created in the beginning of this tutorial). This is because `bwa-mem` takes forward and reverse data as input, but produces only a single BAM dataset as the output. So what we have in the result is a *list* of four dataset (BAM files; panels **B** and **C**). -![bwa_memCollection_ABC](https://galaxyproject.org/galaxy101/bwa_memCollection_ABC.png) +![bwa_memCollection_ABC](https://galaxyproject.org/tutorials/collections/bwa_memCollection_ABC.png) ## 3. Processing collection as a single entity @@ -106,11 +106,11 @@ Now that `bwa-mem` has finished and generated a collection of BAM datasets we ca Let's perform cleanup of our BAM files with `cleanSam` utility from the **Picard** package: -![cleanSam](https://galaxyproject.org/galaxy101/cleanSam.png) +![cleanSam](https://galaxyproject.org/tutorials/collections/cleanSam.png) -If you look at the picture above carefully, you will see that the **Select SAM/BAM dataset or dataset collection** parameter is empty (it says `No sam or bam datasets available.`). This is because we do not have single SAM or BAM datasets in the history. Instead we have a collection. 
So all you need to do is to click on the **folder** (![folder](https://galaxyproject.org/galaxy101/folder.png)) button and you will our BAM collection selected: +If you look at the picture above carefully, you will see that the **Select SAM/BAM dataset or dataset collection** parameter is empty (it says `No sam or bam datasets available.`). This is because we do not have single SAM or BAM datasets in the history. Instead we have a collection. So all you need to do is to click on the **folder** (![folder](https://galaxyproject.org/tutorials/collections/folder.png)) button and you will our BAM collection selected: -![cleanSam_closeup](https://galaxyproject.org/galaxy101/cleanSam_closeup.png) +![cleanSam_closeup](https://galaxyproject.org/tutorials/collections/cleanSam_closeup.png) Click **Execute**. As an output this tool will produce a collection contained cleaned data. @@ -118,21 +118,21 @@ Click **Execute**. As an output this tool will produce a collection contained cl Now let's clean the dataset further by only preserving truly paired reads (reads satisfying two requirements: (1) read is paired, and (2) it is mapped as a proper pair). For this we will use `Filter SAM or BAM` tools from **SAMTools** collection: -![filter](https://galaxyproject.org/galaxy101/filter.png) +![filter](https://galaxyproject.org/tutorials/collections/filter.png) parameters should be set as shown below. By setting mapping quality to `20` we avoid reads mapping to multiple locations and by using **Filter on bitwise flag** option we ensure that the resulting dataset will contain only properly paired reads. This operation will produce yet another collection containing now filtered datasets. -![filter_closeup](https://galaxyproject.org/galaxy101/filter_closeup.png) +![filter_closeup](https://galaxyproject.org/tutorials/collections/filter_closeup.png) ### 3.2. 
Merging collection into a single dataset The beauty of BAM datasets is that they can be combined in a single entity using so called *Read group* ([learn more](https://wiki.galaxyproject.org/Learn/GalaxyNGS101#Understanding_and_manipulating_SAM.2FBAM_datasets) about Read Groups on old wiki, which will be migrated here shortly). This allows to bundle reads from multiple experiments into a single dataset where read identity is maintained by labelling every sequence with *read group* tags. So let's finally reduce this collection to a single BAM dataset. For this we will use `MergeSamFiles` tool for the `Picard` suite: -![merge](https://galaxyproject.org/galaxy101/merge.png) +![merge](https://galaxyproject.org/tutorials/collections/merge.png) Here we select the collection generated by the filtering tool described above in [3.1](https://github.com/nekrut/galaxy/wiki/Processing-many-samples-at-once#31-retaining-proper-pairs): -![merge_closeup](https://galaxyproject.org/galaxy101/merge_closeup.png) +![merge_closeup](https://galaxyproject.org/tutorials/collections/merge_closeup.png) This operation will **not** generate a collection. Instead, it will generate a single BAM dataset containing mapped reads from our four samples (`M117-bl`, `M117-ch`, `M117C1-bl`, and `M117C1-ch`). @@ -140,19 +140,19 @@ This operation will **not** generate a collection. Instead, it will generate a s So we have one BAM dataset combining everything we've done so far. Let's look at the contents of this dataset using a genome browser. First, we will need to downsample the dataset to avoiding overwhelming the browser. 
For this we will use `Downsample SAM/BAM` tool: -![downsample](https://galaxyproject.org/galaxy101/downsample.png) +![downsample](https://galaxyproject.org/tutorials/collections/downsample.png) Set **Probability (between 0 and 1) that any given read will be kept** to roughly `5%` (or `0.05`) using the slider control: -![downsample_closeup](https://galaxyproject.org/galaxy101/downsample_closeup.png) +![downsample_closeup](https://galaxyproject.org/tutorials/collections/downsample_closeup.png) This will generate another BAM dataset containing only 5% of the original reads and much smaller as a result. Click on this dataset and you will see links to various genome browsers: -![browserLinks](https://galaxyproject.org/galaxy101/browserLinks.png) +![browserLinks](https://galaxyproject.org/tutorials/collections/browserLinks.png) Click the **Human hg38** link in the **display with IGV** line as highlighted above ([learn](https://wiki.galaxyproject.org/Learn/GalaxyNGS101#Visualizing_multiple_datasets_in_Integrated_Genome_Viewer_.28IGV.29) more about displaying Galaxy data in IGV with this [movie](https://vimeo.com/123442619#t=4m16s)). Below is an example generated with IGV on these data. In this screenshot reads are colored by read group (four distinct colors). A yellow inset displays additional information about a single read. One can see that this read corresponds to read group `M117-bl`. -![igv](https://galaxyproject.org/galaxy101/igv.png) +![igv](https://galaxyproject.org/tutorials/collections/igv.png) ## 5. 
We did not fake this: The two histories and the workflow described in this page are accessible directly from this page below: From 20f2fddb05055c63aec9201e0c875f82882f04c6 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:42:22 +0200 Subject: [PATCH 10/35] fix links in proteomics topic --- topics/proteomics/README.md | 8 ++++---- .../proteomics/tutorials/database-handling/tutorial.md | 4 ++-- .../tutorials/database-handling/workflows/readme.md | 10 +++++----- .../tutorials/protein-id-sg-ps/workflows/readme.md | 16 ++++++++-------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/topics/proteomics/README.md b/topics/proteomics/README.md index 913cf862..cb6219c6 100644 --- a/topics/proteomics/README.md +++ b/topics/proteomics/README.md @@ -2,7 +2,7 @@ GalaxyP Training material ================= topic | features -:-- | :--: +:-- | :--: [Database Handling](tutorials/database-handling.md) | [:book:](tutorials/database-handling.md) [Peptide and Protein ID](tutorials/proteinID_SG_PS.md) | [:book:](tutorials/proteinID_SG_PS.md) [Protein Quantitation by Stable Isotope Labelling (SIL)](tutorials/proteinQuant_SIL.md) | [:book:](tutorials/proteinQuant_SIL.md) @@ -24,10 +24,10 @@ You are free to use and modify all workflows for your scientific question. 
topic | features :-- | :--: -[N-Tails](workflows/ntails/) | [:page_facing_up:](https://galaxyproject.github.io/training-material/Proteomics/workflows/ntails/) [:book:](./workflows/ntails/) -[Secretome Prediction](workflows/secretome_prediction/) | [:page_facing_up:](https://galaxyproject.github.io/training-material/Proteomics/workflows/secretome_prediction/) [:book:](./workflows/secretome_prediction/) +[N-Tails](./tutorials/ntails/workflows/) | [:page_facing_up:](https://github.com/galaxyproject/training-material/tree/master/topics/proteomics/tutorials/ntails/workflows) [:book:](https://github.com/galaxyproject/training-material/tree/master/topics/proteomics/tutorials/ntails/workflows) +[Secretome Prediction](https://github.com/galaxyproject/training-material/tree/master/topics/proteomics/tutorials/secretome-prediction/workflows) | [:page_facing_up:](https://github.com/galaxyproject/training-material/tree/master/topics/proteomics/tutorials/secretome-prediction/workflows) [:book:](https://github.com/galaxyproject/training-material/tree/master/topics/proteomics/tutorials/secretome-prediction/workflows) ### Disclaimer Thanks for your interest in the GTN GalaxyP training material project. We only recently started working on this repository. We try to integrate more and more topics over time. Keep an eye out for new material from time to time. -If you want to join the discussion about new material to integrate, please post your thoughts [here](https://github.com/galaxyproject/training-material/issues/237). We are most thankful for any feedback! \ No newline at end of file +If you want to join the discussion about new material to integrate, please post your thoughts [here](https://github.com/galaxyproject/training-material/issues/237). We are most thankful for any feedback! 
diff --git a/topics/proteomics/tutorials/database-handling/tutorial.md b/topics/proteomics/tutorials/database-handling/tutorial.md index 64c80524..1b530e8a 100644 --- a/topics/proteomics/tutorials/database-handling/tutorial.md +++ b/topics/proteomics/tutorials/database-handling/tutorial.md @@ -62,7 +62,7 @@ In proteomic samples, some protein contaminants are very common, stemming from t 1. Contamination can be observed, heavily contaminated samples can be excluded from analysis. 2. Contaminant peptides cannot be misassigned to similar peptides in the database reducing the risk of identifying false positives. -A widely used database for common contaminants is the **c**ommon **R**epository of **A**dventitious **P**roteins (cRAP). When using samples generated in cell cultures, it is furthermore recommended to include Mycoplasma proteomes in the search database. Mycoplasma infections are very common in cell culture and often go unnoticed ([Drexler and Uphoff, Cytotechnology, 2002](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3463982/)). +A widely used database for common contaminants is the **c**ommon **R**epository of **A**dventitious **P**roteins (cRAP). When using samples generated in cell cultures, it is furthermore recommended to include Mycoplasma proteomes in the search database. Mycoplasma infections are very common in cell culture and often go unnoticed ([Drexler and Uphoff, Cytotechnology, 2002](https://dx.doi.org/10.1023%2FA%3A1022913015916)). > ### :pencil2: Hands-on: Contaminant databases > 1. Open **Protein Database Downloader** :wrench:. @@ -93,7 +93,7 @@ A widely used database for common contaminants is the **c**ommon **R**epository > ### :pencil2: Optional Hands-On: Mycoplasma databases -> 90 - 95 % of mycoplasma infection in cell culture stem from the following species: M. orale, M. hyorhinis, M. arginini, M. fermentans, M. hominis and A. laidlawii ([Drexler and Uphoff, Cytotechnology, 2002](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3463982/)). 
+> 90 - 95 % of mycoplasma infection in cell culture stem from the following species: M. orale, M. hyorhinis, M. arginini, M. fermentans, M. hominis and A. laidlawii ([Drexler and Uphoff, Cytotechnology, 2002](https://dx.doi.org/10.1023%2FA%3A1022913015916)). > > 1. Use **Protein Database Downloader** :wrench: to download the six mycoplasma databases. We will merge them to the main database in the next part of the tutorial. > 2. Run **FASTA Merge Files and Filter Unique Sequences** :wrench: to combine all mycoplasma databases into a single one. diff --git a/topics/proteomics/tutorials/database-handling/workflows/readme.md b/topics/proteomics/tutorials/database-handling/workflows/readme.md index d60a0ca4..7f0fcb31 100644 --- a/topics/proteomics/tutorials/database-handling/workflows/readme.md +++ b/topics/proteomics/tutorials/database-handling/workflows/readme.md @@ -1,6 +1,6 @@ # GalaxyP Workflow: Database Handling -This workflow supplements the GalaxyP tutorial on database handling ([Link](https://galaxyproject.github.io/training-material//Proteomics/tutorials/database-handling)). +This workflow supplements the GalaxyP tutorial on database handling ([Link](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/database-handling/tutorial.html)). You can find two versions of the same workflow: one creates a database with only the Crap archive added, the other one additionally adds the proteomes of the most common mycoplasma contaminants. An overview of the workflow is given below: @@ -9,14 +9,14 @@ An overview of the workflow is given below: ## Inputs and Customization -For this workflow, you need no prior inputs. You can change the database to download after starting to run the workflow (default: Homo sapiens). To do so, click on the `Taxonomy` option in `1: Protein Database Downloader`. -You may also change other options about the database to download (`reviewed`, `Proteome Set` and `Include isoform data`). 
Please refer to the [database handling tutorial](https://galaxyproject.github.io/training-material//Proteomics/tutorials/database-handling) for details. +For this workflow, you need no prior inputs. You can change the database to download after starting to run the workflow (default: Homo sapiens). To do so, click on the `Taxonomy` option in `1: Protein Database Downloader`. +You may also change other options about the database to download (`reviewed`, `Proteome Set` and `Include isoform data`). Please refer to the [database handling tutorial](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/database-handling/tutorial.html) for details. ![Input_options](../../../images/wf_databaseHandling_options.png) ## Outputs -The workflow provides two outputs: +The workflow provides two outputs: 1. the database with Crap (and mycoplasma) added, and -2. the same database with a reversed decoy database attached. \ No newline at end of file +2. the same database with a reversed decoy database attached. diff --git a/topics/proteomics/tutorials/protein-id-sg-ps/workflows/readme.md b/topics/proteomics/tutorials/protein-id-sg-ps/workflows/readme.md index 3f2a53d6..64db4ed7 100644 --- a/topics/proteomics/tutorials/protein-id-sg-ps/workflows/readme.md +++ b/topics/proteomics/tutorials/protein-id-sg-ps/workflows/readme.md @@ -1,32 +1,32 @@ # GalaxyP Workflow: Protein Identification (using Search GUI and Peptide Shaker) -This workflow supplements the GalaxyP tutorial on Protein ID ([Link](https://galaxyproject.github.io/training-material//Proteomics/tutorials/proteinID_SG_PS)). +This workflow supplements the GalaxyP tutorial on Protein ID ([Link](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/protein-id-sg-ps/tutorial.html)). You can find two versions of the same workflow: one is designed for a single MS run as an input. It can also be used for parallel analysis of multiple MS runs. 
Each MS run will result in a separate output file. The second workflow is designed for multiple MS runs to be combined into a single output file. - + An overview of the workflow is given below: ![Protein ID Workflow](../../images/wf_proteinID_SG_PS.png) ## Inputs -Two inputs are needed: +Two inputs are needed: -1. A protein FASTA database to be searched against. Using the current settings, a database **without decoys** is needed. The decoys will be automatically added by ***Search GUI*** :wrench: . To learn more about databases, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material//Proteomics/tutorials/database-handling). For creating a database, you can also use a [ready-made workflow](../database-handling/). +1. A protein FASTA database to be searched against. Using the current settings, a database **without decoys** is needed. The decoys will be automatically added by ***Search GUI*** :wrench: . To learn more about databases, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/database-handling/tutorial.html). For creating a database, you can also use a [ready-made workflow](../database-handling/). 2. At least one mass spectrometry data file in the mzML format. With the current settings, you will need a non-centroided mzML (raw data, no prior peak-picking). If you have data in another format or already centroided data, please consider the section [below](#customizing-the-workflow). ## Outputs -The workflow provides the identified proteins, peptides and PSMs as an output. For details on the ***Peptide Shaker*** :wrench: outputs, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material//Proteomics/tutorials/database-handling). +The workflow provides the identified proteins, peptides and PSMs as an output. 
For details on the ***Peptide Shaker*** :wrench: outputs, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/database-handling/tutorial.html). ## Customizing the Workflow You can customize the workflow after importing it to your Galaxy instance. Click on `Workflows`, choose this workflow and click on `Edit`. - *Using a centroided mzML file*: Delete the tool ***PeakPickerHiRes*** :wrench: , use the mzML directly as an input for ***FileConverter*** :wrench: . -- *Using another MS file format than mzML*: +- *Using another MS file format than mzML*: - For `*.mgf`: Delete the tool ***PeakPickerHiRes*** :wrench: and ***FileConverter*** :wrench: , use the mzML directly as an input for ***Search GUI*** :wrench: . - - For `*.raw`: Convert to mzML before running the workflow. For details, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material//Proteomics/tutorials/database-handling). - - For other formats: Use the ***FileConverter*** :wrench: to convert to mzML (profile data) or directly to mgf (centroided data). \ No newline at end of file + - For `*.raw`: Convert to mzML before running the workflow. For details, please consider the tutorial on [database handling](https://galaxyproject.github.io/training-material/topics/proteomics/tutorials/database-handling/tutorial.html). + - For other formats: Use the ***FileConverter*** :wrench: to convert to mzML (profile data) or directly to mgf (centroided data). 
From d556303f7b263adbc0dfd8135cd7bd4fcf6b79cc Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:43:07 +0200 Subject: [PATCH 11/35] fix links in sequence-analysis topic --- topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md b/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md index c10c8b7f..f9fc88f8 100644 --- a/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md +++ b/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md @@ -134,7 +134,7 @@ The demultiplexed sequences are raw sequences from the sequencing machine, witho ## Quality control -For quality control, we use similar tools as described in [NGS-QC tutorial](http://galaxyproject.github.io/training-material//NGS-QC/tutorials/dive_into_qc): [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +For quality control, we use similar tools as described in [NGS-QC tutorial](http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html): [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). 
> ### :pencil2: Hands-on: Quality control > From 13445b06624fa6f5dc1824aca4999512cd7c8472 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:50:27 +0200 Subject: [PATCH 12/35] fix links in training topic --- topics/training/slides/index.html | 8 +++---- .../create-new-tutorial-content/tutorial.md | 26 +++++++++++----------- .../create-new-tutorial-docker/tutorial.md | 12 +++++----- .../create-new-tutorial-jekyll/tutorial.md | 14 ++++++------ .../create-new-tutorial-metadata/tutorial.md | 12 +++++----- .../create-new-tutorial-tours/tutorial.md | 10 ++++----- 6 files changed, 41 insertions(+), 41 deletions(-) diff --git a/topics/training/slides/index.html b/topics/training/slides/index.html index 97524eaa..05f3c049 100644 --- a/topics/training/slides/index.html +++ b/topics/training/slides/index.html @@ -6,16 +6,16 @@ ### Training Ressources - Learn Galaxy @ Hub - - [https://galaxyproject.org/learn/](https://galaxyproject.org/learn/) + + [https://galaxyproject.org/learn/ ](https://galaxyproject.org/learn/) - Galaxy Training Network - [https://galaxyproject.org/teach/gtn/](https://galaxyproject.org/teach/gtn/) + [https://galaxyproject.org/teach/gtn/ ](https://galaxyproject.org/teach/gtn/) - Galaxy Trainer Directory - [https://galaxyproject.org/teach/trainers/](https://galaxyproject.org/teach/trainers/) + [https://galaxyproject.org/teach/trainers/ ](https://galaxyproject.org/teach/trainers/) --- diff --git a/topics/training/tutorials/create-new-tutorial-content/tutorial.md b/topics/training/tutorials/create-new-tutorial-content/tutorial.md index 73ae3e83..23e4a77d 100644 --- a/topics/training/tutorials/create-new-tutorial-content/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-content/tutorial.md @@ -13,9 +13,9 @@ Galaxy is a great solution to train the bioinformatics concepts: - it trains to use technology, outlining available resources and efforts that have made them accessible to researchers - it is scalable -In 2016, the Galaxy 
Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. +In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. -We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material](https://github.com/galaxyproject/training-material). +We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material ](https://github.com/galaxyproject/training-material). We decided on a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. Each tutorial follows the same structure and comes with a virtualised isntance to run the training everywhere. In this tutorial, you will learn how to write your first tutorial in markdown and contribute it to the Galaxy Training Network. @@ -49,7 +49,7 @@ Before anything, we need to get a local copy of the content of the GitHub reposi ## Defining the topic -The first step we need to define is in which topic putting our tutorial. This first step can be tricky. +The first step we need to define is in which topic putting our tutorial. This first step can be tricky. When we structured the repository, we decided here to use as topic the names of the categories in the [ToolShed](https://toolshed.g2.bx.psu.edu/). So when decided where to put your tutorial, you can look in which ToolShed's category are the main tools used in the tutorial and use this category as topic. For example, this tutorial will rely on the NCBI Blast+ tool. 
@@ -173,12 +173,12 @@ If they are not correctly defined the tutorial can not be found on the website. > ### :pencil2: Hands-on: Fix the top metadata > > 1. Change the `tutorial-name` and the `topic_name` to fit to the ones defined in the metadata -> 2. Check if the tutorial has been correctly added at [http://localhost:4000/topics/sequence-analysis/similarity-search](http://localhost:4000/topics/sequence-analysis/similarity-search) +> 2. Check if the tutorial has been correctly added at [http://localhost:4000/topics/sequence-analysis/similarity-search ](http://localhost:4000/topics/sequence-analysis/similarity-search) {: .hands_on} ### Content of the tutorial -Directly after the short metadata section on top the content of your tutorial starts. It is writen in Markdow - a simple markup langage. +Directly after the short metadata section on top the content of your tutorial starts. It is written in Markdow - a simple markup langage. > ### :bulb: Tip: Markdown > @@ -190,7 +190,7 @@ no need to add the name of the tutorial: it is automatically added based on the We recommend to structure the tutorials like this -- An introdcution to introduce the tutorial with the use case, the data, the methods +- An introduction to introduce the tutorial with the use case, the data, the methods - Several sections with the content of the tutorial and some hands-on parts (practicing is an important part of the learning process) - A conclusion to summarize what has been done in the tutorial (with a scheme) @@ -245,7 +245,7 @@ This structure needs to be respected otherwise it would not be interpreted corre > 1. [Pretreatments](#pretreatments) > 2. [Mapping](#mapping) > 3. 
[Analysis of the differential expression](#analysis-of-the-differential-expression) - > {: .agenda} + {: .agenda} ![](../../../../shared/images/tutorial_agenda_box.png) @@ -275,11 +275,11 @@ This structure needs to be respected otherwise it would not be interpreted corre > ### :pencil2: Hands-on: Add an hands-on box > - > 1. Add an hands-on box to run a BLAST of the small sequence dataset against the chosen database + > 1. Add an hands-on box to run a BLAST of the small sequence dataset against the chosen database {: .hands_on} -- Questions - +- Questions + The questions are then to force the trainees to think about what they are currently doing and to put things in perspective. They are also a way to help the instructors to expose and clearify misunderstanding earily on. @@ -329,7 +329,7 @@ This structure needs to be respected otherwise it would not be interpreted corre ![](../../../../shared/images/tutorial_comment_box.png) - Key points - + This last box of the tutorial is automatically filled with the take-home messages defined in the metadata @@ -362,7 +362,7 @@ to serve on the website slides related to the tutorial. The slides are written in Markdown (only the file extension is .html), as the tutorial and are rendered as a webpage thanks to [`Remark`](https://remarkjs.com). However this is not done automatically. We first need to tell the templating system to search for the slides by changing `slides` in the metadata from `no` to `yes`. 
-Once it is done, the slides for our tutorial will be accessible at [http://localhost:4000/topics/sequence-analysis/tutorials/similarity-search/slides.html](http://localhost:4000/topics/sequence-analysis/tutorials/similarity-search/slides.html) +Once it is done, the slides for our tutorial will be accessible at [http://localhost:4000/topics/sequence-analysis/tutorials/similarity-search/slides.html ](http://localhost:4000/topics/sequence-analysis/tutorials/similarity-search/slides.html) We can now fill the `slides.html` file: @@ -398,4 +398,4 @@ After, each new slide is introduced by `---`, and the content of each slide is w > 2. Make sure they are accessible and correctly generated {: .hands_on} -# Conclusion \ No newline at end of file +# Conclusion diff --git a/topics/training/tutorials/create-new-tutorial-docker/tutorial.md b/topics/training/tutorials/create-new-tutorial-docker/tutorial.md index 290480c0..b8d3ebfc 100644 --- a/topics/training/tutorials/create-new-tutorial-docker/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-docker/tutorial.md @@ -13,9 +13,9 @@ Galaxy is a great solution to train the bioinformatics concepts: - it trains to use technology, outlining available resources and efforts that have made them accessible to researchers - it is scalable -In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. +In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. -We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material](https://github.com/galaxyproject/training-material). 
+We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material ](https://github.com/galaxyproject/training-material). We decided on a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. Each tutorial follows the same structure and comes with a virtualised isntance to run the training everywhere. In this tutorial, you will learn how to create a virtualised Galaxy instance, based on Docker, to run your training - either on normal computers or cloud environments. @@ -35,7 +35,7 @@ In this tutorial, you will learn how to create a virtualised Galaxy instance, ba # Building a Galaxy instance specifically for your training -To able to run the tutorial, we need a Galaxy instance where the needed tools are installed and the data. We need then to describe the needed technical infrastructure. +To able to run the tutorial, we need a Galaxy instance where the needed tools are installed and the data. We need then to describe the needed technical infrastructure. This description will be used to automatically set up a Docker Galaxy flavour and also to test if a public Galaxy instance is able to run the tool. @@ -104,7 +104,7 @@ The URL must refer to the URL of the files in Zenodo. Some of the tools require specific databases, specifically prepared for the tool. Then some Galaxy tools come with data managers to manage these databases. 
-If you need such data managers for your tool, you can describe their running with the `data-manager.yaml` file: +If you need such data managers for your tool, you can describe their running with the `data-manager.yaml` file: ``` data_managers: @@ -127,7 +127,7 @@ data_managers: Once the tutorial is ready, we need to extract workflows with the different steps of the tutorial and add them to the `workflows` directory in the tutorial with some explanation about the tutorial in a `README.md` file -> ### :pencil2: Hands-on: Extract the workflow +> ### :pencil2: Hands-on: Extract the workflow > > 1. Extract the workflow for the tutorial > 2. Add some description about the tutorial in a `README.md` file with the workflow file @@ -140,7 +140,7 @@ It is a great way to run the tutorial directly inside Galaxy. To learn more abou ## Testing the technical infrastructure -Once we defined all the requirements for running the tutorial, we can test these requirements. +Once we defined all the requirements for running the tutorial, we can test these requirements. Every topic will come with a Docker image containing the tools, data, workflows and Galaxy Interactive Tours required by each tutorial of this topic. The Docker image is described in the Dockerfile found in the `docker` directory of each topic. This file uses scripts to automatically add the files for each tutorial. The only thing to change is the name of the topic in the Dockerfile copied from the templates. 
diff --git a/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md b/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md index 01f7bb58..42097788 100644 --- a/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md @@ -13,9 +13,9 @@ Galaxy is a great solution to train the bioinformatics concepts: - it trains to use technology, outlining available resources and efforts that have made them accessible to researchers - it is scalable -In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. +In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. -We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material](https://github.com/galaxyproject/training-material). +We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material ](https://github.com/galaxyproject/training-material). We decided on a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. Each tutorial follows the same structure and comes with a virtualised isntance to run the training everywhere. In this tutorial, you will learn how to run a local instance of the GTN webiste with all materials to test and develop new training sessions. 
@@ -41,19 +41,19 @@ We can use Jekyll to run a server to check if the tutorial is correctly added an > ### :pencil2: Hands-on: Checking the website generation locally > > 1. Install Jekyll using [RubyGems](https://rubygems.org/pages/download): `make install` -> +> > 2. Run a local Jekyll server: `make serve` -> 3. Visualize at [http://localhost:4000/](http://localhost:4000/) -> +> 3. Visualize at [http://localhost:4000/ ](http://localhost:4000/) +> > > ### :question: Questions > > > > How to check if the server was started and if all topics are included? > > > >
> > Click to view answers -> > Please check [http://localhost:4000/topics/](http://localhost:4000/topics/) to get a list of topics. +> > Please check [http://localhost:4000/topics/ ](http://localhost:4000/topics/) to get a list of topics. > >
> {: .question} {: .hands_on} -# Conclusion \ No newline at end of file +# Conclusion diff --git a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md index 4337c8a0..5565b352 100644 --- a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md @@ -13,9 +13,9 @@ Galaxy is a great solution to train the bioinformatics concepts: - it trains to use technology, outlining available resources and efforts that have made them accessible to researchers - it is scalable -In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. +In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as always in Galaxy. -We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material](https://github.com/galaxyproject/training-material). +We took inspiration from [Software Carpentry](https://software-carpentry.org) and collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material ](https://github.com/galaxyproject/training-material). We decided on a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. Each tutorial follows the same structure and comes with a virtualised isntance to run the training everywhere. In this tutorial, you will learn how to annotate your training material with a lot of metadata, so that it can be reused and empower other services. 
@@ -49,7 +49,7 @@ The first file we will fill is the `metadata.yaml` file describing the metadata - `link`: relative for internal (inside training material) requirement or full for external requirement) - `type`: the type of link (`internal` or `external`) -This information is used to automatically make the tutorial available on the online website: [http://galaxyproject.github.io/training-material/](http://galaxyproject.github.io/training-material/) +This information is used to automatically make the tutorial available on the online website: [http://galaxyproject.github.io/training-material/ ](http://galaxyproject.github.io/training-material/) > ### :pencil2: Hands-on: Fill the basic metadata > @@ -74,9 +74,9 @@ In the second part of the metadata, we define metadata related to the content of - `key_points`: list of take-home messages - This information will appear at the end of the tutorial + This information will appear at the end of the tutorial -For this metadata, we take inspiration from what Software Carpentry is doing and particularly what they describe in their [Instructor training](http://swcarpentry.github.io/instructor-training/) and the section ["Lessons and Objectives"](http://swcarpentry.github.io/instructor-training/19-lessons/). +For this metadata, we take inspiration from what Software Carpentry is doing and particularly what they describe in their [Instructor training](http://swcarpentry.github.io/instructor-training/) and the section ["Lessons and Objectives"](http://swcarpentry.github.io/instructor-training/19-lessons/). > ### :pencil2: Hands-on: Fill the pedagogical metadata > @@ -88,4 +88,4 @@ We recommend you to fill the questions and the learning objectives before starti For the take-home messages, it is easier to define them once the tutorial is written and you identified the issues. 
-# Conclusion \ No newline at end of file +# Conclusion diff --git a/topics/training/tutorials/create-new-tutorial-tours/tutorial.md b/topics/training/tutorials/create-new-tutorial-tours/tutorial.md index 04edf6e7..f325a11f 100644 --- a/topics/training/tutorials/create-new-tutorial-tours/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-tours/tutorial.md @@ -13,11 +13,11 @@ Galaxy is a great solution to train the bioinformatics concepts: - it trains to use technology, outlining available resources and efforts that have made them accessible to researchers - it is scalable -In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as most of the time in Galaxy. +In 2016, the Galaxy Training Network decide to set up a new infrastructure for delivering easily Galaxy related training material. The idea was to develop something open and online based on a community effort, as most of the time in Galaxy. -We take inspiration from [Software Carpentry](https://software-carpentry.org). We collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material](https://github.com/galaxyproject/training-material). We decided a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. Each tutorial follows the same structure and comes with a technical support to be able to run. +We take inspiration from [Software Carpentry](https://software-carpentry.org). We collected everything on a GitHub repository: [https://github.com/galaxyproject/training-material ](https://github.com/galaxyproject/training-material). We decided a structure based on tutorials with hands-on, fitting both for online self-training but also for workshops, grouped in topics. 
Each tutorial follows the same structure and comes with a technical support to be able to run. -In this tutorial, you will understand how to design and develop a new tutorial fitting in this training material repository. As doing helps to understand, we will develop a small tutorial to explain BLAST with the full infrastructure to be able to run this tutorial anywhere. +In this tutorial, you will understand how to design and develop a new tutorial fitting in this training material repository. As doing helps to understand, we will develop a small tutorial to explain BLAST with the full infrastructure to be able to run this tutorial anywhere. > ### Devloping GTN training material > @@ -48,7 +48,7 @@ title_default: "Welcome to Galaxy" steps: - title: "Welcome to Galaxy" content: "This short tour will guide you through Galaxy's user interface.
- You can navigate with your arrow keys and leave the tour at any time point + You can navigate with your arrow keys and leave the tour at any time point with 'Escape' or the 'End tour' button." backdrop: true @@ -88,7 +88,7 @@ The YAML file of a tour can be integrated in a Galaxy instance by placing the YA [A Web browser plugin](https://github.com/TailorDev/galaxy-tourbuilder) is available to help the creation and the test (on the fly) of an interactive tour. -
Galaxy Tour Builder by TailorDev
+
Galaxy Tour Builder by TailorDev
> ### :pencil2: Hands-on: Install and start the plugin > From a6bed5fe5629c9927e45ca6c11352e2c2bfe4845 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 17:53:57 +0200 Subject: [PATCH 13/35] update link to closed-access article to author's pdf version --- topics/transcriptomics/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/transcriptomics/README.md b/topics/transcriptomics/README.md index 317a678d..e656a652 100644 --- a/topics/transcriptomics/README.md +++ b/topics/transcriptomics/README.md @@ -39,7 +39,7 @@ It will launch a flavored Galaxy instance available on **Shirley Pepke et al:** [Computation for ChIP-seq and RNA-seq studies](http://www.nature.com/nmeth/journal/v6/n11s/full/nmeth.1371.html) -**Paul L. Auer & R. W. Doerge:** [Statistical Design and Analysis of RNA Sequencing Data](http://www.genetics.org/content/185/2/405) +**Paul L. Auer & R. W. Doerge:** [Statistical Design and Analysis of RNA Sequencing Data](http://www.stat.purdue.edu/~doerge/BIOINFORM.D/SPRING10/auer_doerge_genetics_2010.pdf) DOI: 10.1534/genetics.110.114983 > Insights into proper planning of your RNA-seq run! To read before any RNA-seq experiment! 
From e400167f19413d10fb6cb17c469422f980db30fe Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 18:10:46 +0200 Subject: [PATCH 14/35] fix links in dev topic --- topics/dev/tutorials/architecture/slides.html | 68 +++++++++++----------- topics/dev/tutorials/conda/slides.html | 16 ++--- topics/dev/tutorials/containers/slides.html | 4 +- .../dev/tutorials/visualization-charts/tutorial.md | 8 +-- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/topics/dev/tutorials/architecture/slides.html b/topics/dev/tutorials/architecture/slides.html index d1be6969..c4aadbd1 100644 --- a/topics/dev/tutorials/architecture/slides.html +++ b/topics/dev/tutorials/architecture/slides.html @@ -32,11 +32,11 @@ --- -**Gitter:** [galaxyproject/Lobby](https://gitter.im/galaxyproject/Lobby) +**Gitter:** [galaxyproject/Lobby ](https://gitter.im/galaxyproject/Lobby) -**IRC:** [irc.freenode.net#galaxyproject](https://webchat.freenode.net/?channels=galaxyproject) +**IRC:** [irc.freenode.net#galaxyproject ](https://webchat.freenode.net/?channels=galaxyproject) -**GitHub:** [github.com/galaxyproject](https://github.com/galaxyproject) +**GitHub:** [github.com/galaxyproject ](https://github.com/galaxyproject) **Twitter:** #usegalaxy, @galaxyproject @@ -54,7 +54,7 @@ --- -[github.com/galaxyproject/**galaxy**](https://github.com/galaxyproject/galaxy) +[galaxyproject/**galaxy** ](https://github.com/galaxyproject/galaxy) The main Galaxy application. @@ -64,18 +64,18 @@ --- -[github.com/galaxyproject/**cloudman**](https://github.com/galaxyproject/cloudman) +[galaxyproject/**cloudman**](https://github.com/galaxyproject/cloudman) Galaxy CloudMan - a web application which manages a Galaxy cluster in the cloud. 
-[github.com/galaxyproject/**cloudlaunch**](https://github.com/galaxyproject/cloudlaunch) +[galaxyproject/**cloudlaunch** ](https://github.com/galaxyproject/cloudlaunch) CloudLaunch web application to make it easy to launch images on a cloud, drives *https://launch.usegalaxy.org* --- -[github.com/galaxyproject/**tools-iuc**](https://github.com/galaxyproject/tools-iuc) +[galaxyproject/**tools-iuc** ](https://github.com/galaxyproject/tools-iuc) Galaxy tools maintained by the *IUC* ("Intergalactic Utilities Commission"). @@ -84,7 +84,7 @@ Demonstrates *current tool development best practices* - development on github and then deployed to test/main ToolSheds -[github.com/galaxyproject/**tools-devteam**](https://github.com/galaxyproject/tools-devteam) +[galaxyproject/**tools-devteam** ](https://github.com/galaxyproject/tools-devteam) Many older tools appearing on usegalaxy.org. @@ -112,7 +112,7 @@ --- -[github.com/galaxyproject/**starforge**](https://github.com/galaxyproject/starforge) +[galaxyproject/**starforge** ](https://github.com/galaxyproject/starforge) Build Galaxy Tool dependencies for the ToolShed in Docker containers @@ -120,21 +120,21 @@ --- -[github.com/galaxyproject/**planemo**](https://github.com/galaxyproject/planemo) +[galaxyproject/**planemo** ](https://github.com/galaxyproject/planemo) Commande line utilities to assist in the development of Galaxy tools. Linting, testing, deploying to ToolSheds... 
*The best practice approach for Galaxy tool development!* -[github.com/galaxyproject/**planemo-machine**](https://github.com/galaxyproject/planemo-machine) +[galaxyproject/**planemo-machine** ](https://github.com/galaxyproject/planemo-machine) Builds Galaxy environments for Galaxy tool development including Docker container, virtual machines, Google compute images --- -github.com/galaxyproject/**{ansible-\*, \*-playbook}** +galaxyproject/**{ansible-\*, \*-playbook}** [Ansible](https://www.ansible.com/) components to automate almost every aspect of Galaxy installation and maintenance. @@ -144,7 +144,7 @@ --- -[github.com/galaxyproject/**pulsar**](https://github.com/galaxyproject/pulsar) +[galaxyproject/**pulsar**](https://github.com/galaxyproject/pulsar) Distributed job execution engine for Galaxy. @@ -156,7 +156,7 @@ --- -[github.com/galaxyproject/**bioblend**](https://github.com/galaxyproject/bioblend) +[galaxyproject/**bioblend** ](https://github.com/galaxyproject/bioblend) Official Python client for the Galaxy, ToolShed, and CloudMan APIs. @@ -164,15 +164,15 @@ --- -- [github.com/galaxyproject/**blend4php**](https://github.com/galaxyproject/blend4php) -- [github.com/**jmchilton/blend4j**](https://github.com/jmchilton/blend4j) -- [github.com/**chapmanb/clj-blend**](https://github.com/chapmanb/clj-blend) +- [galaxyproject/**blend4php**](https://github.com/galaxyproject/blend4php) +- [**jmchilton/blend4j**](https://github.com/jmchilton/blend4j) +- [**chapmanb/clj-blend**](https://github.com/chapmanb/clj-blend) Galaxy API bindings for other languages. --- -[github.com/**bgruening/docker-galaxy-stable**](https://github.com/bgruening/docker-galaxy-stable) +[**bgruening/docker-galaxy-stable** ](https://github.com/bgruening/docker-galaxy-stable) High quality Docker containers for stable Galaxy environments. @@ -237,7 +237,7 @@ ??? 
If the chief architectual principle guiding the frontend is a fast and accessible -experience for the bench scientist, perhaps for the backend it is allowing +experience for the bench scientist, perhaps for the backend it is allowing deployment on many different platforms and a different scales. --- @@ -623,7 +623,7 @@ Controllers should ideally be thin wrappers around actions defined in managers. -Whenever model require more than just the database, the operation should be defined +Whenever model require more than just the database, the operation should be defined in a manager instead of in the model. --- @@ -951,21 +951,21 @@ migrate.versioning.script.base DEBUG 2016-06-23 19:11:51,994 Loading script lib/galaxy/model/migrate/versions/0002_metadata_file_table.py... migrate.versioning.script.base DEBUG 2016-06-23 19:11:52,009 Loading script lib/galaxy/model/migrate/versions/0131_subworkflow_and_input_parameter_modules.py... ... -galaxy.model.migrate.check INFO 2016-06-23 19:13:32,812 Migrating 128 -> 129... -galaxy.model.migrate.check INFO 2016-06-23 19:13:33,436 +galaxy.model.migrate.check INFO 2016-06-23 19:13:32,812 Migrating 128 -> 129... +galaxy.model.migrate.check INFO 2016-06-23 19:13:33,436 galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 Migration script to allow invalidation of job external output metadata temp files -galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 -galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 -galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 Migrating 129 -> 130... -galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 +galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 +galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 +galaxy.model.migrate.check INFO 2016-06-23 19:13:33,437 Migrating 129 -> 130... +galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 Migration script to change the value column of user_preference from varchar to text. 
-galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 -galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 -galaxy.model.migrate.check INFO 2016-06-23 19:13:34,326 Migrating 130 -> 131... -galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 +galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 +galaxy.model.migrate.check INFO 2016-06-23 19:13:34,325 +galaxy.model.migrate.check INFO 2016-06-23 19:13:34,326 Migrating 130 -> 131... +galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 Migration script to support subworkflows and workflow request input parameters -galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 -galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 +galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 +galaxy.model.migrate.check INFO 2016-06-23 19:13:35,633 ```] --- @@ -1133,7 +1133,7 @@ class: reduce90 -### Tool Dependency Resolution and Indexing +### Tool Dependency Resolution and Indexing .code[``` galaxy.tools.deps WARNING 2016-06-23 19:13:36,498 Path './database/dependencies' does not exist, ignoring @@ -1360,7 +1360,7 @@ Jobs across many clusters ] -*http://usegalaxy.org/production* +* http://usegalaxy.org/production * --- diff --git a/topics/dev/tutorials/conda/slides.html b/topics/dev/tutorials/conda/slides.html index 7bf759b7..83a5af34 100644 --- a/topics/dev/tutorials/conda/slides.html +++ b/topics/dev/tutorials/conda/slides.html @@ -34,7 +34,7 @@ --help_from_command 'seqtk seq' ``` -Notice the `--requirement seqtk@1.2`. +Notice the `--requirement seqtk@1.2`. --- @@ -58,7 +58,7 @@ ??? -- Notice that multiple tools may be mapped to the same requirements and +- Notice that multiple tools may be mapped to the same requirements and any tools may use multiple Conda recipes. 
- There are few different ways to populate Applications and Libraries on the right - we will talk about Conda which is what we consider the @@ -181,7 +181,7 @@ ## Conda and Galaxy Galaxy now automatically installs Conda when first launched and will use [Bioconda](https://bioconda.github.io/) and other channels -for package resolution. +for package resolution. --- @@ -301,7 +301,7 @@ ## Using the Tool Environment -Now that we have verified the Conda environment setup with `conda_install` works properly on the +Now that we have verified the Conda environment setup with `conda_install` works properly on the command-line, we can use our tool! `planemo test` and `planemo serve` will use this environment by default now for this tool. @@ -660,7 +660,7 @@ 1. Fork [Bioconda](https://github.com/bioconda/bioconda-recipes/fork). 1. Clone your fork: - `git clone https://github.com/myuser/bioconda-recipes` + `git clone https://github.com//bioconda-recipes` 1. Create a new branch `package` `git checkout -b package` @@ -756,7 +756,7 @@ source: fn: {{ name }}-{{ version }}.zip url: http://coolsoftware.com/{{ name }}/{{ version }}/{{ name }}-{{ version }}.zip - + {% endraw %} ``` @@ -833,7 +833,7 @@ ```sh #!/bin/bash # Remove gcc statements that do not work on older compilers for CentOS5 -# support, from https://github.com/chapmanb/bcbio-conda/blob/master/pysam/build.sh +# support sed -i'' -e 's/"-Wno-error=declaration-after-statement",//g' setup.py sed -i'' -e 's/"-Wno-error=declaration-after-statement"//g' setup.py # linking htslib, see: @@ -1075,7 +1075,7 @@ - Module::Build::PPMMaker about: - home: http://metacpan.org/pod/Module-Build + home: http://metacpan.org/pod/Module::Build license: perl_5 summary: 'Build and install Perl modules ``` diff --git a/topics/dev/tutorials/containers/slides.html b/topics/dev/tutorials/containers/slides.html index 93d7c810..d971da78 100644 --- a/topics/dev/tutorials/containers/slides.html +++ b/topics/dev/tutorials/containers/slides.html @@ -343,7 
+343,7 @@ ## Galaxy Terminology - Mulled -To *mull* is to create an environment (either in the Conda sense or globally +To *mull* is to create an environment (either in the Conda sense or globally inside a container) for one or more Conda packages. The result of this is a *mulled* environment. @@ -465,7 +465,7 @@ [galaxy.tools.deps] Using dependency bwa version 0.7.15 of type conda [galaxy.tools.deps] Using dependency samtools version 1.3.1 of type conda [galaxy.tools.deps.containers] Checking with container resolver [ExplicitContainerResolver[]] found description [None] -[galaxy.tools.deps.containers] Checking with container resolver [CachedMulledContainerResolver[namespace=None]] found description [ContainerDescription[identifier=quay.io/biocontainers/mulled-v1-01afc412d1f216348d85970ce5f88c984aa443f3:latest,type=docker]] +[galaxy.tools.deps.containers] Checking with container resolver [CachedMulledContainerResolver[namespace=None]] found description [ContainerDescription[identifier=quay.io/ biocontainers/mulled-v1-01afc412d1f216348d85970ce5f88c984aa443f3:latest,type=docker]] [galaxy.jobs.command_factory] Built script [/tmp/tmpQs0gyp/job_working_directory/000/1/tool_script.sh] for tool command [bwa > /tmp/tmpQs0gyp/files/000/dataset_1.dat 2>&1 ; samtools > /tmp/tmpQs0gyp/files/000/dataset_2.dat 2>&1] [galaxy.tools.deps] Using dependency samtools version None of type conda [galaxy.tools.deps] Using dependency samtools version None of type conda diff --git a/topics/dev/tutorials/visualization-charts/tutorial.md b/topics/dev/tutorials/visualization-charts/tutorial.md index 865a5625..60137fd9 100644 --- a/topics/dev/tutorials/visualization-charts/tutorial.md +++ b/topics/dev/tutorials/visualization-charts/tutorial.md @@ -48,11 +48,11 @@ In this tutorial we are going to demonstrate how to add a 3rd-party visualizatio > > More resources on this file format: > -> - 
[https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) -> - [http://www.wwpdb.org/documentation/file-format](http://www.wwpdb.org/documentation/file-format) +> - [https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format) ](https://en.wikipedia.org/wiki/Protein_Data_Bank_(file_format)) +> - [http://www.wwpdb.org/documentation/file-format ](http://www.wwpdb.org/documentation/file-format) {: .tip} -As mentioned above we will be focusing on the *PV-Javascript Protein Viewer* in this tutorial. Now that we have learned about the underlying file format, let us continue by visiting the viewers developer site at [https://biasmv.github.io/pv/](https://biasmv.github.io/pv/) to get familiar with the plugin. +As mentioned above we will be focusing on the *PV-Javascript Protein Viewer* in this tutorial. Now that we have learned about the underlying file format, let us continue by visiting the viewers developer site at [https://biasmv.github.io/pv/ ](https://biasmv.github.io/pv/) to get familiar with the plugin. > ### :pencil2: Hands-on > @@ -245,7 +245,7 @@ In this section we will select a `PDB`-file from the Protein Databank and visual > ### :pencil2: Hands-on > -> 1. Visit [http://www.rcsb.org](http://www.rcsb.org) and select a protein structure e.g. [1ACB](http://www.rcsb.org/pdb/explore/explore.do?structureId=1acb) +> 1. Visit [http://www.rcsb.org ](http://www.rcsb.org) and select a protein structure e.g. [1ACB](http://www.rcsb.org/pdb/explore/explore.do?structureId=1acb) > > 2. Copy the link to the raw `PDB`-file e.g. 
> From 38c3f35bf8d11949a7a1c469ab789794ac194bb9 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Tue, 4 Jul 2017 18:13:06 +0200 Subject: [PATCH 15/35] add some urls to whitelis bioteam.net always return 403 for some reason --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4397f7ba..3572a8b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ before_script: script: - set -e # Check links - - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist http://localhost:8080 {}" + - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist http://localhost:8080,bioteam.net,publish.twitter.com {}" # # Check structure #- ./bin/check_structure.py From 1c6e74bbc9a91c4e11234d4010bc8cbc7c7a88da Mon Sep 17 00:00:00 2001 From: shiltemann Date: Wed, 5 Jul 2017 14:55:21 +0200 Subject: [PATCH 16/35] fix links variant analysis topic --- topics/variant-analysis/slides/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/topics/variant-analysis/slides/index.html b/topics/variant-analysis/slides/index.html index dfdb4805..9dee9140 100644 --- a/topics/variant-analysis/slides/index.html +++ b/topics/variant-analysis/slides/index.html @@ -72,5 +72,5 @@ ### 2 tutorials -- [Introductory tutorial](http://galaxyproject.github.io/training-material/Exome-Seq/tutorials/Exome-Seq.html) -- [Detailed tutorial](http://galaxyproject.github.io/training-material/Exome-Seq/tutorials/Diploid-variant-calling.html) +- [Introductory tutorial](http://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/exome-seq/tutorial.html) +- [Detailed tutorial](http://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.html) From 
50f4c23274f951be5c5bc89b28a6c6680783fcaa Mon Sep 17 00:00:00 2001 From: shiltemann Date: Wed, 5 Jul 2017 17:51:01 +0200 Subject: [PATCH 17/35] fix link --- topics/usegalaxy/tutorials/dunovo/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/usegalaxy/tutorials/dunovo/tutorial.md b/topics/usegalaxy/tutorials/dunovo/tutorial.md index ad7827d2..6233df87 100644 --- a/topics/usegalaxy/tutorials/dunovo/tutorial.md +++ b/topics/usegalaxy/tutorials/dunovo/tutorial.md @@ -34,7 +34,7 @@ The entire analysis described here is accessible as a [Galaxy history](https://u >![History Item](http://galaxyproject.org/duplex/histItem.png) > ->Each history item has a Rerun ![refresh](http://galaxyproject.org/galaxy101/fa-refresh.png) button. Clicking this button will show you how this tool was run with all parameters filled in exactly. +>Each history item has a Rerun ![refresh](https://galaxyproject.org/tutorials/g101/fa-refresh.png) button. Clicking this button will show you how this tool was run with all parameters filled in exactly. This analysis (and consequently the Galaxy's history) can be divided into three parts 1. 
Consensus generation from initial sequencing reads; From abb4b110d87a33e424c4845041a145a692c8268b Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 15:29:07 +0200 Subject: [PATCH 18/35] use https for training material links --- topics/admin/tutorials/advanced-galaxy-customisation/slides.html | 2 +- topics/admin/tutorials/database-schema/tutorial.md | 2 +- topics/admin/tutorials/dev-to-production/tutorial.md | 2 +- topics/dev/tutorials/interactive-tours/slides.html | 2 +- topics/dev/tutorials/webhooks/slides.html | 2 +- topics/introduction/README.md | 2 +- topics/introduction/slides/index.html | 2 +- topics/proteomics/tutorials/database-handling/tutorial.md | 4 ++-- topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-metadata/tutorial.md | 4 ++-- topics/transcriptomics/README.md | 4 ++-- topics/usegalaxy/README.md | 4 ++-- topics/variant-analysis/slides/index.html | 4 ++-- 13 files changed, 18 insertions(+), 18 deletions(-) diff --git a/topics/admin/tutorials/advanced-galaxy-customisation/slides.html b/topics/admin/tutorials/advanced-galaxy-customisation/slides.html index 32997edb..2ab11be5 100644 --- a/topics/admin/tutorials/advanced-galaxy-customisation/slides.html +++ b/topics/admin/tutorials/advanced-galaxy-customisation/slides.html @@ -257,7 +257,7 @@ -- -* Embedding [twitter feeds](http://publish.twitter.com) is popular, e.g. [usegalaxy.org](https://usegalaxy.org) +* Embedding [twitter feeds](https://publish.twitter.com) is popular, e.g. [usegalaxy.org](https://usegalaxy.org) -- diff --git a/topics/admin/tutorials/database-schema/tutorial.md b/topics/admin/tutorials/database-schema/tutorial.md index e2b4d1f1..cf5cc5c5 100644 --- a/topics/admin/tutorials/database-schema/tutorial.md +++ b/topics/admin/tutorials/database-schema/tutorial.md @@ -58,7 +58,7 @@ What’s not in the database is the data. 
Datasets are stored outside the databa Entity-relationship diagrams are a way to understand tables and the relationships between them inside a relational database. SchemaSpy (http://schemaspy.sourceforge.net/) is a free (and remarkable tool) for generating ER diagrams. We’be used it generate a description of the database backing the server in this container. See - https://galaxyproject.org /schema/SchemaSpy/index.html + https://galaxyproject.org/schema/SchemaSpy/index.html The “Tables” tab is a good place to start learning the structure of the database. Each table represents a different type of thing, and often that thing is itself a relationship. For example, each record in the dataset table has information about a specific dataset, while records in the history_dataset_association table have information about what histories that dataset is in. diff --git a/topics/admin/tutorials/dev-to-production/tutorial.md b/topics/admin/tutorials/dev-to-production/tutorial.md index f91b24d7..8efb1391 100644 --- a/topics/admin/tutorials/dev-to-production/tutorial.md +++ b/topics/admin/tutorials/dev-to-production/tutorial.md @@ -20,7 +20,7 @@ Move from dev instance to production instance :heavy_check_mark: ***Requirements*** -- *[Galaxy Server Administration](http://galaxyproject.github.io/training-material/topics/admin/slides/)* +- *[Galaxy Server Administration](https://galaxyproject.github.io/training-material/topics/admin/slides/)* :hourglass: ***Time estimation*** *TODO* diff --git a/topics/dev/tutorials/interactive-tours/slides.html b/topics/dev/tutorials/interactive-tours/slides.html index b87421ce..e92220fe 100644 --- a/topics/dev/tutorials/interactive-tours/slides.html +++ b/topics/dev/tutorials/interactive-tours/slides.html @@ -94,7 +94,7 @@ - Create Tours and share them with the world! 
- on the [community collection of Interactive Tours](https://github.com/galaxyproject/galaxy-tours) or - - add them to the [Galaxy Training Network](http://galaxyproject.github.io/training-material/) + - add them to the [Galaxy Training Network](https://galaxyproject.github.io/training-material/) - Improve the Tours implementation - Improve creating of Tours - by enhancing the [Galaxy Tour Builder](https://github.com/dannon/tourbuilder) web extension diff --git a/topics/dev/tutorials/webhooks/slides.html b/topics/dev/tutorials/webhooks/slides.html index 31e73c14..90d8d44d 100644 --- a/topics/dev/tutorials/webhooks/slides.html +++ b/topics/dev/tutorials/webhooks/slides.html @@ -113,7 +113,7 @@ - add additional ones ... - Improve the documentation or training material - [Documentation](https://docs.galaxyproject.org/en/latest/admin/special_topics/webhooks.html) - - [Galaxy Training Network](http://galaxyproject.github.io/training-material/) + - [Galaxy Training Network](https://galaxyproject.github.io/training-material/) --- diff --git a/topics/introduction/README.md b/topics/introduction/README.md index 4aac5be2..36ec11e8 100644 --- a/topics/introduction/README.md +++ b/topics/introduction/README.md @@ -9,7 +9,7 @@ Here, you will find some material to learn how to use Galaxy. 
A deck of slides is available for this topic: -- [General introduction about Galaxy](http://galaxyproject.github.io/training-material/topics/introduction/slides/) +- [General introduction about Galaxy](https://galaxyproject.github.io/training-material/topics/introduction/slides/) # Tutorials diff --git a/topics/introduction/slides/index.html b/topics/introduction/slides/index.html index f300c1e2..793ea093 100644 --- a/topics/introduction/slides/index.html +++ b/topics/introduction/slides/index.html @@ -231,4 +231,4 @@ - Be part of an active and friendly community - Get support and your questions answered on [Galaxy Biostars](https://biostar.usegalaxy.org/) - Access community curated documentation on [Galaxy Community Hub](https://www.galaxyproject.org/) -- Learn more about Galaxy for scientists and for developers and admins on [Galaxy Training Community](http://galaxyproject.github.io/training-material/) +- Learn more about Galaxy for scientists and for developers and admins on [Galaxy Training Community](https://galaxyproject.github.io/training-material/) diff --git a/topics/proteomics/tutorials/database-handling/tutorial.md b/topics/proteomics/tutorials/database-handling/tutorial.md index 1b530e8a..64c80524 100644 --- a/topics/proteomics/tutorials/database-handling/tutorial.md +++ b/topics/proteomics/tutorials/database-handling/tutorial.md @@ -62,7 +62,7 @@ In proteomic samples, some protein contaminants are very common, stemming from t 1. Contamination can be observed, heavily contaminated samples can be excluded from analysis. 2. Contaminant peptides cannot be misassigned to similar peptides in the database reducing the risk of identifying false positives. -A widely used database for common contaminants is the **c**ommon **R**epository of **A**dventitious **P**roteins (cRAP). When using samples generated in cell cultures, it is furthermore recommended to include Mycoplasma proteomes in the search database. 
Mycoplasma infections are very common in cell culture and often go unnoticed ([Drexler and Uphoff, Cytotechnology, 2002](https://dx.doi.org/10.1023%2FA%3A1022913015916)). +A widely used database for common contaminants is the **c**ommon **R**epository of **A**dventitious **P**roteins (cRAP). When using samples generated in cell cultures, it is furthermore recommended to include Mycoplasma proteomes in the search database. Mycoplasma infections are very common in cell culture and often go unnoticed ([Drexler and Uphoff, Cytotechnology, 2002](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3463982/)). > ### :pencil2: Hands-on: Contaminant databases > 1. Open **Protein Database Downloader** :wrench:. @@ -93,7 +93,7 @@ A widely used database for common contaminants is the **c**ommon **R**epository > ### :pencil2: Optional Hands-On: Mycoplasma databases -> 90 - 95 % of mycoplasma infection in cell culture stem from the following species: M. orale, M. hyorhinis, M. arginini, M. fermentans, M. hominis and A. laidlawii ([Drexler and Uphoff, Cytotechnology, 2002](https://dx.doi.org/10.1023%2FA%3A1022913015916)). +> 90 - 95 % of mycoplasma infection in cell culture stem from the following species: M. orale, M. hyorhinis, M. arginini, M. fermentans, M. hominis and A. laidlawii ([Drexler and Uphoff, Cytotechnology, 2002](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3463982/)). > > 1. Use **Protein Database Downloader** :wrench: to download the six mycoplasma databases. We will merge them to the main database in the next part of the tutorial. > 2. Run **FASTA Merge Files and Filter Unique Sequences** :wrench: to combine all mycoplasma databases into a single one. 
diff --git a/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md b/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md index f9fc88f8..31f23cbb 100644 --- a/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md +++ b/topics/sequence-analysis/tutorials/de-novo-rad-seq/tutorial.md @@ -134,7 +134,7 @@ The demultiplexed sequences are raw sequences from the sequencing machine, witho ## Quality control -For quality control, we use similar tools as described in [NGS-QC tutorial](http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html): [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +For quality control, we use similar tools as described in [NGS-QC tutorial](https://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html): [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). > ### :pencil2: Hands-on: Quality control > diff --git a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md index 5565b352..734c5aca 100644 --- a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md @@ -49,7 +49,7 @@ The first file we will fill is the `metadata.yaml` file describing the metadata - `link`: relative for internal (inside training material) requirement or full for external requirement) - `type`: the type of link (`internal` or `external`) -This information is used to automatically make the tutorial available on the online website: [http://galaxyproject.github.io/training-material/ ](http://galaxyproject.github.io/training-material/) +This information is used to automatically make the tutorial available on the online website: [https://galaxyproject.github.io/training-material/ ](https://galaxyproject.github.io/training-material/) > ### :pencil2: 
Hands-on: Fill the basic metadata > @@ -76,7 +76,7 @@ In the second part of the metadata, we define metadata related to the content of This information will appear at the end of the tutorial -For this metadata, we take inspiration from what Software Carpentry is doing and particularly what they describe in their [Instructor training](http://swcarpentry.github.io/instructor-training/) and the section ["Lessons and Objectives"](http://swcarpentry.github.io/instructor-training/19-lessons/). +For this metadata, we take inspiration from what Software Carpentry is doing and particularly what they describe in their [Instructor training](https://swcarpentry.github.io/instructor-training/) and the section ["Lessons and Objectives"](https://swcarpentry.github.io/instructor-training/19-lessons/). > ### :pencil2: Hands-on: Fill the pedagogical metadata > diff --git a/topics/transcriptomics/README.md b/topics/transcriptomics/README.md index e656a652..fce3d48f 100644 --- a/topics/transcriptomics/README.md +++ b/topics/transcriptomics/README.md @@ -7,13 +7,13 @@ RNA-sequencing is a method used to reveal the presence and quantity of RNA in a A deck of slides is available for this topic: -- [General introduction about RNA seq data analysis](http://galaxyproject.github.io/training-material/topics/transcriptomics/slides/) +- [General introduction about RNA seq data analysis](https://galaxyproject.github.io/training-material/topics/transcriptomics/slides/) # Tutorials A tutorial with hands-on is available for this topic: -- [Reference-based RNA-seq data analysis](http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html) +- [Reference-based RNA-seq data analysis](https://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html) ## Input datasets diff --git a/topics/usegalaxy/README.md b/topics/usegalaxy/README.md index bd704cc0..657408dd 100644 --- a/topics/usegalaxy/README.md +++ 
b/topics/usegalaxy/README.md
@@ -7,9 +7,9 @@ Topic name
 Several deck of slides are available for this topic:
-- [General introduction about ](http://galaxyproject.github.io/training-material/templates/slides/)
+- [General introduction about ](https://galaxyproject.github.io/training-material/templates/slides/)
 - Slide deck related to the tutorials:
-    - [Tutorial name](http://galaxyproject.github.io/training-material/templates/slides/tutorial.html)
+    - [Tutorial name](https://github.com/galaxyproject/training-material/tree/master/templates/tutorials)
 # Tutorials
diff --git a/topics/variant-analysis/slides/index.html b/topics/variant-analysis/slides/index.html
index 9dee9140..f165a680 100644
--- a/topics/variant-analysis/slides/index.html
+++ b/topics/variant-analysis/slides/index.html
@@ -72,5 +72,5 @@
 ### 2 tutorials
-- [Introductory tutorial](http://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/exome-seq/tutorial.html)
-- [Detailed tutorial](http://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.html)
+- [Introductory tutorial](https://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/exome-seq/tutorial.html)
+- [Detailed tutorial](https://galaxyproject.github.io/training-material/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.html)
From 699d119b02e9eba5db38d92f768d1678fb3be21f Mon Sep 17 00:00:00 2001
From: shiltemann
Date: Fri, 7 Jul 2017 15:29:20 +0200
Subject: [PATCH 19/35] use https for training material links
---
 CONTRIBUTING.md | 6 +++---
 shared/slides/project_presentation.html | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5d71e0a1..b54b710a 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -31,7 +31,7 @@ If you have any questions, you can reach us using the [Gitter chat](https://gitt
-This repository is a project of unification of the Galaxy training material. You can find more information about this project in this [small presentation](http://galaxyproject.github.io/training-material/shared/slides/project_presentation#/) +This repository is a project of unification of the Galaxy training material. You can find more information about this project in this [small presentation](https://galaxyproject.github.io/training-material/shared/slides/project_presentation#/) By contributing, you agree that we may redistribute your work under [this repository's license](LICENSE.md). @@ -162,7 +162,7 @@ You can then visualize locally ([http://localhost:4000/](http://localhost:4000/) ## How do I add a new topic? 1. Copy the [`templates`](templates) directory, rename it and move it to the [`topics`](topics) -2. Fill the meta information about the topic in the `metadata.yaml` file +2. Fill the meta information about the topic in the `metadata.yaml` file - `name`: name of the topic (same name as the `yml` file and the directory) - `title`: title of the topic - `type`: targeted users (`"use"` or `""`) @@ -359,7 +359,7 @@ Slide notes The first slides (with the title, the requirements,...) are automatically generated using the metadata of the topic. Then the content to fill starts with the introduction. They are then rendered with [`Remark`](https://remarkjs.com/). Template for the `html` files can be found in -[`templates/slides/`](templates/slides/). Once the slides are on the `master` branch, they will be available at `http://galaxyproject.github.io/training-material//slides/.html` +[`templates/slides/`](templates/slides/). 
Once the slides are on the `master` branch, they will be available at `https://galaxyproject.github.io/training-material//slides/.html` You can also add yourself as contributor for the topic in the `yml` file of the related topic that is in `metadata` directory diff --git a/shared/slides/project_presentation.html b/shared/slides/project_presentation.html index e721bbba..033ca96b 100644 --- a/shared/slides/project_presentation.html +++ b/shared/slides/project_presentation.html @@ -120,7 +120,7 @@ Landing page - [http://galaxyproject.github.io/training-material/ ](http://galaxyproject.github.io/training-material/) + [https://galaxyproject.github.io/training-material/ ](https://galaxyproject.github.io/training-material/) From 6332bbba72934898a35c3639be180b6a820b8f3c Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 16:39:20 +0200 Subject: [PATCH 20/35] re-enable link checking --- .travis.yml | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3572a8b6..41f14890 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ services: before_script: # dump the changed files into a file - - git diff --name-only "${TRAVIS_COMMIT_RANGE/.../..}" > ./CHANGES.list + #- git diff --name-only "${TRAVIS_COMMIT_RANGE/.../..}" > ./CHANGES.list # Spell checking? #- apt-get install -y hunspell pandoc #- wget https://cgit.freedesktop.org/libreoffice/dictionaries/tree/en/en_GB.aff @@ -19,27 +19,20 @@ before_script: - pip install vl pyyaml - vl --version + # patch vl to send user-agent header (some links give error if not set) + - | + sed -i 's|requests = (grequests.head(u, timeout=timeout, verify=False)|requests = (grequests.head(u, timeout=timeout, verify=False, headers={\x27User-Agent\x27: \x27Mozilla\x27})|' /home/travis/virtualenv/python*/lib/python*/site-packages/vl/cli.py + script: - set -e # Check links - - find . 
\( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist http://localhost:8080,bioteam.net,publish.twitter.com {}" + - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist localhost,publish.twitter.com {}" # # Check structure #- ./bin/check_structure.py - # Check docker + # Check docker (this will take way too long for travis, just set up quay.io triggers and monitor status somewhere?) #- ./bin/prepare_docker_checks.py #- | - # export ROOT=$PWD - # while read -r DIR - # do - # echo "$DIR" - # cd "$DIR" - # docker build -f Dockerfile.old . - # cd $ROOT - # echo "" - # done < DOCKER_BUILDS.list - # Once all tutorials use new docker bootstrap script use this - #- | # while read -r DIR # do # echo "$DIR" From 4fd1e5a1a38a05e30fcc8de027de85e54c76b562 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 16:40:01 +0200 Subject: [PATCH 21/35] fix some more links --- CONTRIBUTING.md | 6 +++--- README.md | 14 +++++++------- templates/slides/index.html | 2 +- templates/tutorials/tutorial1/slides.html | 2 +- .../architecture/sync_architecture_from_galaxy.sh | 2 +- topics/dev/tutorials/visualization-generic/tutorial.md | 2 +- .../tutorials/quality-control/slides.html | 2 +- topics/usegalaxy/tutorials/dip/tutorial.md | 2 +- topics/usegalaxy/tutorials/dunovo/tutorial.md | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b54b710a..48d13136 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -230,7 +230,7 @@ The content of a tutorial hands-on is written in Markdown. 
They are rendered by ``` --- layout: tutorial_slides - topic_name: "Dev-Corner" + topic_name: "dev" tutorial_name: tool_integration logo: "GTN" --- @@ -346,7 +346,7 @@ logo: "GTN" .image-25[![](../images/ecker_2012.jpg)] -[*Ecker et al, Nature, 2012*](http://www.nature.com/nature/journal/v489/n7414/full/489052a.html) +[*Ecker et al, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pubmed/22955614) ??? @@ -377,7 +377,7 @@ Filling tutorial slides are similar a combination of filling introduction slides ``` --- layout: tutorial_slides - topic_name: "Dev-Corner" + topic_name: "dev" tutorial_name: tool_integration logo: "GTN" --- diff --git a/README.md b/README.md index e8baf823..affbeef3 100644 --- a/README.md +++ b/README.md @@ -7,14 +7,14 @@ Training material | topic | features | | :-- | :--: | +| [Galaxy Introduction](topics/introduction) | [:whale:](topics/introduction/docker/)[:book:](topics/introduction/tutorials/) | | [Genome Annotation](Genome-Annotation/) | [:book:](Genome-Annotation/tutorials/general-introduction.md) | -| [RNA-seq](RNA-Seq/) | [:whale:](RNA-Seq/docker/) [:movie_camera:](https://vimeo.com/128268401) [:page_facing_up:](https://usegalaxy.org/u/jeremy/p/galaxy-rna-seq-analysis-exercise) [:book:](RNA-Seq/tutorials/ref_based.md) [:mortar_board:](RNA-Seq/slides/index.html) | -| [ChIP-seq](ChIP-Seq/) | [:whale:](ChIP-Seq/docker/) [:book:](ChIP-Seq/) [:mortar_board:](ChIP-Seq/slides/index.html) | -| [Exome-seq](Exome-Seq/) | [:whale:](Exome-Seq/docker) [:book:](Exome-Seq/tutorials) [:mortar_board:](Exome-Seq/slides/index.html/) | -| [MethylC-seq](MethylC-Seq) | [:book:](MethylC-Seq/tutorials/Methylation-Seq.md) | -| [Galaxy Introduction](Introduction) | [:whale:](Introduction/docker/)[:book:](Introduction/tutorials/) | -| [Data Sources](Dev-Corner/tutorials/data_source_integration.md) | :book: | -| [Galaxy Tool Dev Corner](Dev-Corner) | [:book:](Dev-Corner/readme.md) | +| [Transcriptomics](topics/transcriptomics/) | 
[:whale:](topics/transcriptomics/docker/) [:movie_camera:](https://vimeo.com/128268401) [:page_facing_up:](https://usegalaxy.org/u/jeremy/p/galaxy-rna-seq-analysis-exercise) [:book:](topics/transcriptomics/tutorials/ref-based/tutorial.md) [:mortar_board:](topics/transcriptomics/slides/index.html) |
+| [ChIP-seq](topics/chip-seq/) | [:whale:](topics/chip-seq/docker/) [:book:](topics/chip-seq/) [:mortar_board:](topics/chip-seq/slides/index.html) |
+| [Variant Analysis](topics/variant-analysis/) | [:whale:](topics/variant-analysis/docker) [:book:](topics/variant-analysis/tutorials) [:mortar_board:](topics/variant-analysis/slides/index.html) |
+| [Epigenetics](topics/epigenetics/) | [:book:](topics/epigenetics/tutorials/methylation-seq/tutorial.md) |
+| [Data Sources](topics/dev/tutorials/data_source_integration.md) | :book: |
+| [Galaxy Tool Dev Corner](topics/dev/) | [:book:](topics/dev/readme.md) |
 | [Galaxy Docker Project](https://slides.com/bgruening/the-galaxy-docker-project/) | [:book:](https://slides.com/bgruening/the-galaxy-docker-project/) |
diff --git a/templates/slides/index.html b/templates/slides/index.html
index c6eaec7e..a1ec1676 100644
--- a/templates/slides/index.html
+++ b/templates/slides/index.html
@@ -26,7 +26,7 @@
 ![](../images/RNA_seq_zang2016.png)
-[*Zang and Mortazavi, Nature, 2012*](http://www.nature.com/ni/journal/v13/n9/full/ni.2407.html)
+[*Zang and Mortazavi, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138050/)
---
diff --git a/templates/tutorials/tutorial1/slides.html b/templates/tutorials/tutorial1/slides.html
index 887f06a9..988e9d45 100644
--- a/templates/tutorials/tutorial1/slides.html
+++ b/templates/tutorials/tutorial1/slides.html
@@ -27,7 +27,7 @@
 ![](../images/RNA_seq_zang2016.png)
-[*Zang and Mortazavi, Nature, 2012*](http://www.nature.com/ni/journal/v13/n9/full/ni.2407.html)
+[*Zang and Mortazavi, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138050/)
---
diff --git a/topics/dev/tutorials/architecture/sync_architecture_from_galaxy.sh
b/topics/dev/tutorials/architecture/sync_architecture_from_galaxy.sh index 4de8b30c..65824ef8 100755 --- a/topics/dev/tutorials/architecture/sync_architecture_from_galaxy.sh +++ b/topics/dev/tutorials/architecture/sync_architecture_from_galaxy.sh @@ -10,7 +10,7 @@ SLIDES_DIR="." cat > "$SLIDES_DIR/architecture.html" <[*Ecker et al, Nature, 2012*](http://www.nature.com/nature/journal/v489/n7414/full/489052a.html) +[*Ecker et al, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pubmed/22955614) --- diff --git a/topics/usegalaxy/tutorials/dip/tutorial.md b/topics/usegalaxy/tutorials/dip/tutorial.md index 25c1f002..f8581a38 100644 --- a/topics/usegalaxy/tutorials/dip/tutorial.md +++ b/topics/usegalaxy/tutorials/dip/tutorial.md @@ -45,7 +45,7 @@ Variant calling is a complex field that was significantly propelled by advances * **Variant calling** - identification of positions where the sequenced sample is different from the reference sequence (or [reference genome graph](https://github.com/vgteam/vg)); * **Genotype calling** - identifying individual's genotype at variable sites. -A typical workflow for variation discovery involves the following steps (e.g., see Nielsen et al. [2011](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)): +A typical workflow for variation discovery involves the following steps (e.g., see Nielsen et al. [2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)): 1. Mapping reads against the reference genome 2. Thresholding BAM datasets by, for example, retaining paired, properly mapped reads diff --git a/topics/usegalaxy/tutorials/dunovo/tutorial.md b/topics/usegalaxy/tutorials/dunovo/tutorial.md index 6233df87..70bb1ff9 100644 --- a/topics/usegalaxy/tutorials/dunovo/tutorial.md +++ b/topics/usegalaxy/tutorials/dunovo/tutorial.md @@ -4,7 +4,7 @@ topic_name: usegalaxy tutorial_name: dunovo --- -This page explains how to perform discovery of low frequency variants from duplex sequencing data. 
As an example we use the _ABL1_ dataset published by [Schmitt and colleagues](http://www.nature.com/nmeth/journal/v12/n5/full/nmeth.3351.html) (SRA accession [SRR1799908](http://www.ncbi.nlm.nih.gov/sra/?term=SRR1799908)). +This page explains how to perform discovery of low frequency variants from duplex sequencing data. As an example we use the _ABL1_ dataset published by [Schmitt and colleagues](https://www.ncbi.nlm.nih.gov/pubmed/25849638) (SRA accession [SRR1799908](http://www.ncbi.nlm.nih.gov/sra/?term=SRR1799908)). # Background @@ -20,7 +20,7 @@ The computational analysis of DS data (Part `C` in the figure above) produces tw * Single Strand Consensus Sequences (SSCS; panel `iv` in the figure above); * Duplex Consensus Sequences (DCS; panel `v` in the figure above). -The DCSs have the ultimate accuracy, yet the SSCSs can also be very useful when ampliconic DNA is used as an input to a DS experiment. Let us illustrate the utility of SSCSs with the following example. Suppose one is interested in quantifying variants in a virus that has a very low titer in body fluids. Since DS procedure requires a substantial amount of starting DNA (between [between 0.2 and 3 micrograms](http://nature.com/nprot/journal/v9/n11/full/nprot.2014.170.html)) the virus needs to be enriched. This can be done, for example, with a PCR designed to amplify the entire genome of the virus. Yet the problem is that during the amplification heterologous strands will almost certainly realign to some extent forming hetoroduplex molecules: +The DCSs have the ultimate accuracy, yet the SSCSs can also be very useful when ampliconic DNA is used as an input to a DS experiment. Let us illustrate the utility of SSCSs with the following example. Suppose one is interested in quantifying variants in a virus that has a very low titer in body fluids. 
Since DS procedure requires a substantial amount of starting DNA (between [between 0.2 and 3 micrograms](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4271547/)) the virus needs to be enriched. This can be done, for example, with a PCR designed to amplify the entire genome of the virus. Yet the problem is that during the amplification heterologous strands will almost certainly realign to some extent forming hetoroduplex molecules: >![hd](../images/het.png) > From 57825c2865e1ebd10f184bbc28e6c483dfa6321d Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 16:40:46 +0200 Subject: [PATCH 22/35] vimeo always returns 403s in travis? --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 41f14890..6218a377 100644 --- a/.travis.yml +++ b/.travis.yml @@ -26,7 +26,7 @@ before_script: script: - set -e # Check links - - find . \( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist localhost,publish.twitter.com {}" + - find . 
\( -name "*.md" -o -name "*.html" \) -not -path "./assets/reveal.js/*" | xargs -L 1 -I '{}' sh -c "echo {}; vl -t 5 -s 1000 --allow-codes 405 --whitelist localhost,publish.twitter.com,linkedin.com,vimeo.com {}" # # Check structure #- ./bin/check_structure.py From 1b96a71c402c1a7404344a4d6dc7917f32b4f777 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:08:31 +0200 Subject: [PATCH 23/35] replace nature links with PMC --- topics/chip-seq/slides/index.html | 4 ++-- topics/usegalaxy/slides/index.html | 4 ++-- topics/usegalaxy/slides/tutorial.html | 4 ++-- topics/usegalaxy/tutorials/dip/tutorial.md | 2 +- topics/usegalaxy/tutorials/dunovo/tutorial.md | 4 ++-- topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md | 18 +++++++++--------- .../tutorials/diploid-variant-calling/tutorial.md | 6 +++--- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/topics/chip-seq/slides/index.html b/topics/chip-seq/slides/index.html index 8a7e37bc..c0f6b677 100644 --- a/topics/chip-seq/slides/index.html +++ b/topics/chip-seq/slides/index.html @@ -12,7 +12,7 @@ ![](../images/ecker_2012.jpg) -[*Ecker et al, Nature, 2012*](http://www.nature.com/nature/journal/v489/n7414/full/489052a.html) +[*Ecker et al, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pubmed/22955614) --- @@ -33,4 +33,4 @@ ![](../images/kidder_2011.jpg) -[*Kidder et al, Nature Immunology, 2011*](http://www.nature.com/ni/journal/v12/n10/full/ni.2117.html) +[*Kidder et al, Nature Immunology, 2011*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3541830/) diff --git a/topics/usegalaxy/slides/index.html b/topics/usegalaxy/slides/index.html index 2c2df12e..f49bebd0 100644 --- a/topics/usegalaxy/slides/index.html +++ b/topics/usegalaxy/slides/index.html @@ -10,7 +10,7 @@ ### Topic -Blabla +Blabla - Blabla - Blabla @@ -26,7 +26,7 @@ ![](../images/RNA_seq_zang2016.png) -[*Zang and Mortazavi, Nature, 2012*](http://www.nature.com/ni/journal/v13/n9/full/ni.2407.html) +[*Zang and Mortazavi, Nature, 
2012*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138050/) --- diff --git a/topics/usegalaxy/slides/tutorial.html b/topics/usegalaxy/slides/tutorial.html index 2d368109..195fb511 100644 --- a/topics/usegalaxy/slides/tutorial.html +++ b/topics/usegalaxy/slides/tutorial.html @@ -11,7 +11,7 @@ ### tutorial -Blabla +Blabla - Blabla - Blabla @@ -27,7 +27,7 @@ ![](../images/RNA_seq_zang2016.png) -[*Zang and Mortazavi, Nature, 2012*](http://www.nature.com/ni/journal/v13/n9/full/ni.2407.html) +[*Zang and Mortazavi, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138050/) --- diff --git a/topics/usegalaxy/tutorials/dip/tutorial.md b/topics/usegalaxy/tutorials/dip/tutorial.md index f8581a38..e13a6251 100644 --- a/topics/usegalaxy/tutorials/dip/tutorial.md +++ b/topics/usegalaxy/tutorials/dip/tutorial.md @@ -128,7 +128,7 @@ In the simplest case we can estimate these as follows: Suppose $S_i$ is a base in read $i$ corresponding to a genome position with genotype $G$. The probability of seeing $S_i$ given $G$, or $P(S_i|G)$, is given by the quality score of $S_i$ (the quality scores are given by base calling software and reported as [phred scores](https://en.wikipedia.org/wiki/Phred_quality_score)). Thus the genotype likelihood $P(S|G)$ is the product of $P(S_i|G)$ over all $i$. In reality however there are many other sources of uncertainty (in addition to base qualities) that are incorporated in the calculation of data likelihoods including NGS technology-related issues, dependency of error rates on substitution type (e.g., transitions versus transversions), sequencing context etc... ### $P(G)$ - a single sample case -One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al. 
2011](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. +One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al. 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. ### $P(G)$ - a multi-sample case Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al. 2011](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. 
If, however, in our collection of multiple samples the frequency of **A** is 1% ($p = 0.01$; $q = 1 - p = 0.99$), then from the HWE we have: diff --git a/topics/usegalaxy/tutorials/dunovo/tutorial.md b/topics/usegalaxy/tutorials/dunovo/tutorial.md index 70bb1ff9..46adf7e4 100644 --- a/topics/usegalaxy/tutorials/dunovo/tutorial.md +++ b/topics/usegalaxy/tutorials/dunovo/tutorial.md @@ -52,7 +52,7 @@ The starting point of the analyses are sequencing reads (usually in [fastq](http ## Getting data in and assessing quality -We uploaded [Schmitt:2015](http://www.nature.com/nmeth/journal/v12/n5/full/nmeth.3351.html)) data directly from SRA as shown in [this screencast](https://vimeo.com/121187220). This created two datasets in our galaxy history: one for forward reads and one for reverse. We then evaluated the quality of the data by running FastQC on both datasets (forward and reverse) to obtain the following plots: +We uploaded [Schmitt:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4414912/)) data directly from SRA as shown in [this screencast](https://vimeo.com/121187220). This created two datasets in our galaxy history: one for forward reads and one for reverse. We then evaluated the quality of the data by running FastQC on both datasets (forward and reverse) to obtain the following plots: | :--|:--- @@ -165,7 +165,7 @@ bwa-mem | 130,880,141 | A | G | 0.479 | We can see that results of both mappers agree very well. The reason we see these numbers grouped by mappers is because we have set the readgroups while [mapping](#align-against-genome-with-bwa-and-bwa-mem). -The polymorphism we are interested in (and the one reported by [Schmitt:2015] (http://www.nature.com/nmeth/journal/v12/n5/full/nmeth.3351.html)) is at the position 130,872,141 and has a frequency of 1.3%. The other site (position 130,880,141) is a known common variant [rs2227985](http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs2227985), which is heterozygous in this sample. 
+The polymorphism we are interested in (and the one reported by [Schmitt:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4414912/)) is at the position 130,872,141 and has a frequency of 1.3%. The other site (position 130,880,141) is a known common variant [rs2227985](http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs2227985), which is heterozygous in this sample.

# Analysis of single strand consensus data

diff --git a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md
index 33e909ad..b8c991f0 100644
--- a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md
+++ b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md
@@ -28,7 +28,7 @@ There is a variety of ways in which RNA is treated during its conversion to cDNA
 * Second strand synthesis using DNA polymerase;
 * Library preparation for sequencing.
-In listing these basic steps we are ignoring a vast amount of details such as, for example, normalization strategies and procedures needed to deal with rare RNAs or degraded samples (see [Adiconis:2013](http://nature.com/nmeth/journal/v10/n7/full/nmeth.2483.html)). Yet, there are two important experimental considerations that would effect the ways in which one analyses data and interprets the results. These are:
+In listing these basic steps we are ignoring a vast amount of details such as, for example, normalization strategies and procedures needed to deal with rare RNAs or degraded samples (see [Adiconis:2013](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3821180/)). Yet, there are two important experimental considerations that would affect the ways in which one analyses data and interprets the results. These are:
 * Priming for the first cDNA strand synthesis;
 * Stranded versus Non-stranded libraries.
@@ -53,13 +53,13 @@ RNAs that are typically targeted in RNAseq experiments are single stranded (e.g.
> >**Relationship between DNA and RNA orientation** -During a typical RNAseq experiment the information about strandedness is lost after both strands of cDNA are synthesized, size selected, and converted into sequencing library. However, this information can be quite useful for various aspects of RNAseq analysis such as transcript reconstruction and quantification. There is a number of methods for creating so called *stranded* RNAseq libraries that preserve the strand information (for an excellent overview see Levin et al. [2010](http://www.nature.com/nmeth/journal/v7/n9/full/nmeth.1491.html)): +During a typical RNAseq experiment the information about strandedness is lost after both strands of cDNA are synthesized, size selected, and converted into sequencing library. However, this information can be quite useful for various aspects of RNAseq analysis such as transcript reconstruction and quantification. There is a number of methods for creating so called *stranded* RNAseq libraries that preserve the strand information (for an excellent overview see Levin et al. [2010](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3005310/)): ->[![](../images/stranded_protocols.png)](http://www.nature.com/nmeth/journal/v7/n9/fig_tab/nmeth.1491_F1.html) +>![](../images/stranded_protocols.png) > >**Generation of stranded RNAseq libraries**
->Different types of stranded library generation protocols from [Levin:2010](http://www.nature.com/nmeth/journal/v7/n9/full/nmeth.1491.html) +>Different types of stranded library generation protocols from [Levin:2010](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3005310/) Depending on the approach and whether one performs single- or paired-end sequencing there are multiple possibilities on how to interpret the results of mapping of these reads onto genome/transcriptome: @@ -134,13 +134,13 @@ After sequencing is performed you have a collection of sequencing reads for each >**TopHat has been subsequently improved with the development of TopHat2**
>Image from [Kim:2012](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) summarizes steps involved in aligning of RNAseq reads with TopHat2 -To further optimize and speed up spliced read alignment Kim at al. [2015](http://www.nature.com/nmeth/journal/v12/n4/full/nmeth.3317.html) developed [HISAT](http://ccb.jhu.edu/software/hisat2/index.shtml). It uses a set of [FM-indices](https://en.wikipedia.org/wiki/FM-index) consisting one global genome-wide index and a collection of ~48,000 local overlapping 42 kb indices (~55,000 56 kb indices in HiSat2). This allows to find initial seed locations for potential read alignments in the genome using global index and to rapidly refine these alignments using a corresponding local index: +To further optimize and speed up spliced read alignment Kim at al. [2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) developed [HISAT](http://ccb.jhu.edu/software/hisat2/index.shtml). It uses a set of [FM-indices](https://en.wikipedia.org/wiki/FM-index) consisting one global genome-wide index and a collection of ~48,000 local overlapping 42 kb indices (~55,000 56 kb indices in HiSat2). This allows to find initial seed locations for potential read alignments in the genome using global index and to rapidly refine these alignments using a corresponding local index: >![](../images/hisat.png) > >**Hierarchical Graph FM index in HiSat/HiSat2**
->A part of the read (blue arrow) is first mapped to the genome using the global FM index. The HiSat then tries to extend the alignment directly utilizing the genome sequence (violet arrow). In (**a**) it succeeds and this read aligned as it completely resides within an exon. In (**b**) the extension hits a mismatch. Now HiSat takes advantage of the local FM index overlapping this location to find the appropriate matting for the remainder of this read (green arrow). The (**c**) shows a combination these two strategies: the beginning of the read is mapped using global FM index (blue arrow), extended until it reaches the end of the exon (violet arrow), mapped using local FM index (green arrow) and extended again (violet arrow). Image from [Kim:2015](http://www.nature.com/nmeth/journal/v12/n4/full/nmeth.3317.html) +>A part of the read (blue arrow) is first mapped to the genome using the global FM index. The HiSat then tries to extend the alignment directly utilizing the genome sequence (violet arrow). In (**a**) it succeeds and this read aligned as it completely resides within an exon. In (**b**) the extension hits a mismatch. Now HiSat takes advantage of the local FM index overlapping this location to find the appropriate mapping for the remainder of this read (green arrow). The (**c**) shows a combination of these two strategies: the beginning of the read is mapped using global FM index (blue arrow), extended until it reaches the end of the exon (violet arrow), mapped using local FM index (green arrow) and extended again (violet arrow). Image from [Kim:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) ### STAR mapper @@ -171,7 +171,7 @@ Based on these results [Cufflinks](http://cole-trapnell-lab.github.io/cufflinks/ >![](../images/stringtie1.png) > >**StringTie workflow**
->Image from [Pertea:2015](http://www.nature.com/nbt/journal/v33/n3/full/nbt.3122.html) +>Image from [Pertea:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4643835/) In essence StringTie builds an alternative splice graph from overlapping reads in a given locus. In such a graph nodes correspond to exons (or, rather, contiguous regions of genome covered by reads; colored regions on the figure above), while edges are represented by reads connecting these exons. Next, it identifies a path within the splice graph that has the highest weight (largest number of reads on edges). Such path would correspond to an assembled transcript at this iteration of the algorithm. Because the edge weight is equal to the number of the reads StringTie estimates the coverage level for this transcript (see below) which can be used to estimate the transcript's abundance. Reads that are associated with the transcript that was just assembled are then removed and the graph is updated to perform the next iteration of the algorithm. @@ -225,7 +225,7 @@ StringTie, which performs assembly and quantification simultaneously converts sp >![](../images/stringtie2.png) > >**StringTie flow network**
->Here each exon node from the splice graph is split into *in* and *out* nodes connected with an edge weighted by the number of reads corresponding to that exon. For example, the first exon is covered by seven reads and so the edge between 1-in and 1-out has a weight of 7. Expression level would correspond to the maximum flow through a path representing a given transcript. Image from [Pertea:2015](http://www.nature.com/nbt/journal/v33/n3/full/nbt.3122.html) +>Here each exon node from the splice graph is split into *in* and *out* nodes connected with an edge weighted by the number of reads corresponding to that exon. For example, the first exon is covered by seven reads and so the edge between 1-in and 1-out has a weight of 7. Expression level would correspond to the maximum flow through a path representing a given transcript. Image from [Pertea:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4643835/) #### Expectation Maximization @@ -303,7 +303,7 @@ to test for differential gene expression. ### The data -In this example we will use a downsampled version of simulated *Drosophila melanogaster* RNA-seq data used by Trapnell et al. [2012](http://www.nature.com/nprot/journal/v7/n3/full/nprot.2012.016.html). These include two conditions (**C1** and **C2**), each containing three replicates (**R1**, **R2**, and **R3**) sequenced as a paired end library. Thus in total there are 12 fastq datasets. +In this example we will use a downsampled version of simulated *Drosophila melanogaster* RNA-seq data used by Trapnell et al. [2012](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3334321/). These include two conditions (**C1** and **C2**), each containing three replicates (**R1**, **R2**, and **R3**) sequenced as a paired end library. Thus in total there are 12 fastq datasets. 
Here is what to do to load the data: diff --git a/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md b/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md index 3b7dff1a..24a07d21 100644 --- a/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md +++ b/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md @@ -13,7 +13,7 @@ Variant calling is a complex field that was significantly propelled by advances * **Variant calling** - identification of positions where the sequenced sample is different from the reference sequence (or [reference genome graph](https://github.com/vgteam/vg)); * **Genotype calling** - identifying individual's genotype at variable sites. -A typical workflow for variation discovery involves the following steps (e.g., see Nielsen et al. [2011](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)): +A typical workflow for variation discovery involves the following steps (e.g., see Nielsen et al. [2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)): 1. Mapping reads against the reference genome; 2. Thresholding BAM datasets by, for example, retaining paired, properly mapped reads; @@ -102,11 +102,11 @@ Suppose *Ri* is a base in read *i* corresponding to a genome position #### *P(G)* - a single sample case -One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al.](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. 
+One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge contained in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. #### *P(G)* - a multi-sample case -Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al.](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. If, however, in our collection of multiple samples the frequency of **A** is 1% (*p* = 0.01; *q* = 1 - *p* = 0.99), then from the HWE we have: +Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. 
If, however, in our collection of multiple samples the frequency of **A** is 1% (*p* = 0.01; *q* = 1 - *p* = 0.99), then from the HWE we have: | AA (*p2*) | AT (2*pq*) | TT (*q2*)| |---------|---------|--------| From 5bcc30d9451c2b4974f98204cae4d36c4b4bfbf4 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:21:17 +0200 Subject: [PATCH 24/35] replace nature links with PMC --- topics/transcriptomics/README.md | 4 ++-- topics/transcriptomics/slides/index.html | 6 +++--- topics/usegalaxy/tutorials/dip/tutorial.md | 2 +- topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/topics/transcriptomics/README.md b/topics/transcriptomics/README.md index fce3d48f..f980aa13 100644 --- a/topics/transcriptomics/README.md +++ b/topics/transcriptomics/README.md @@ -36,14 +36,14 @@ It will launch a flavored Galaxy instance available on ## Papers -**Shirley Pepke et al:** [Computation for ChIP-seq and RNA-seq studies](http://www.nature.com/nmeth/journal/v6/n11s/full/nmeth.1371.html) +**Shirley Pepke et al:** [Computation for ChIP-seq and RNA-seq studies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4121056/) **Paul L. Auer & R. W. Doerge:** [Statistical Design and Analysis of RNA Sequencing Data](http://www.stat.purdue.edu/~doerge/BIOINFORM.D/SPRING10/auer_doerge_genetics_2010.pdf) DOI: 10.1534/genetics.110.114983 > Insights into proper planning of your RNA-seq run! To read before any RNA-seq experiment! 
-**Ian Korf:**[Genomics: the state of the art in RNA-seq analysis](http://www.nature.com/nmeth/journal/v10/n12/full/nmeth.2735.html) +**Ian Korf:**[Genomics: the state of the art in RNA-seq analysis](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4461013/) > A refreshingly honest view on the non-trivial aspects of RNA-seq analysis diff --git a/topics/transcriptomics/slides/index.html b/topics/transcriptomics/slides/index.html index 529a3847..76743ed1 100644 --- a/topics/transcriptomics/slides/index.html +++ b/topics/transcriptomics/slides/index.html @@ -26,7 +26,7 @@ ![](../images/RNA_seq_zang2016.png) -[*Zang and Mortazavi, Nature, 2012*](http://www.nature.com/ni/journal/v13/n9/full/ni.2407.html) +[*Zang and Mortazavi, Nature, 2012*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4138050/) --- @@ -34,7 +34,7 @@ ![](../images/korf_2013.jpg) -[*Korf, Nat Met, 2013*](http://www.nature.com/nmeth/journal/v10/n12/full/nmeth.2735.html) +[*Korf, Nat Met, 2013*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4461013/) --- @@ -97,7 +97,7 @@ ![](../images/pepke_2009.jpg) -[*Pepke et al, Nat Met, 2009*](http://www.nature.com/nmeth/journal/v6/n11s/full/nmeth.1371.html) +[*Pepke et al, Nat Met, 2009*](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4077321/) --- diff --git a/topics/usegalaxy/tutorials/dip/tutorial.md b/topics/usegalaxy/tutorials/dip/tutorial.md index e13a6251..93276f36 100644 --- a/topics/usegalaxy/tutorials/dip/tutorial.md +++ b/topics/usegalaxy/tutorials/dip/tutorial.md @@ -131,7 +131,7 @@ Suppose $S_i$ is a base in read $i$ corresponding to a genome position with geno One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al. 
2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. ### $P(G)$ - a multi-sample case -Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al. 2011](http://www.nature.com/nrg/journal/v12/n6/full/nrg2986.html)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. If, however, in our collection of multiple samples the frequency of **A** is 1% ($p = 0.01$; $q = 1 - p = 0.99$), then from the HWE we have: +Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al. 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. 
If, however, in our collection of multiple samples the frequency of **A** is 1% ($p = 0.01$; $q = 1 - p = 0.99$), then from the HWE we have: | | | | |---------|---------|--------| diff --git a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md index b8c991f0..80ee6884 100644 --- a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md +++ b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md @@ -190,7 +190,7 @@ To associate reads with transcripts they (the reads) need to be aligned to the t >![](../images/sailfish.png) > >**Assigning reads to transcripts: Sailfish**
->Sailfish indexes input transcriptome for a fixed *k*-mer length and compares *k*-mers derived from RNAseq reads against this index. Image from [Patro:2014](http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html) +>Sailfish indexes input transcriptome for a fixed *k*-mer length and compares *k*-mers derived from RNAseq reads against this index. Image from [Patro:2014](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4077321/) The current version of Sailfish uses [quasi-alignment](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) to extend exact matches found with *k*-mers: From 6be011c1c00e2d06bda7c0124dadf0a0f02469c6 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:28:16 +0200 Subject: [PATCH 25/35] update README links --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index affbeef3..156e4273 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,12 @@ Training material | topic | features | | :-- | :--: | | [Galaxy Introduction](topics/introduction) | [:whale:](topics/introduction/docker/)[:book:](topics/introduction/tutorials/) | -| [Genome Annotation](Genome-Annotation/) | [:book:](Genome-Annotation/tutorials/general-introduction.md) | | [Transcriptomics](topics/transcriptomics/) | [:whale:](topics/transcriptomics/docker/) [:movie_camera:](https://vimeo.com/128268401) [:page_facing_up:](https://usegalaxy.org/u/jeremy/p/galaxy-rna-seq-analysis-exercise) [:book:](RNA-Seq/tutorials/ref_based.md) [:mortar_board:](RNA-Seq/slides/index.html) | | [ChIP-seq](topics/chip-seq/) | [:whale:](topics/chip-seq/docker/) [:book:](topis/chip-seq/) [:mortar_board:](topics/chip-seq/slides/index.html) | | [Variant Analysis](topics/variant-analysis/) | [:whale:](topics/variant-analysis/docker) [:book:](topics/variant-analysis/tutorials) [:mortar_board:](topics/variant-analysis/slides/index.html/) | | [Epigenetics](topics/epigenetics/) | 
[:book:](topics/epigenetics//tutorials/methylation-seq.md) | -| [Data Sources](topics/dev/tutorials/data_source_integration.md) | :book: | -| [Galaxy Tool Dev Corner](topics/dev/) | [:book:](topics/dev/readme.md) | +| [Data Sources](topics/dev/tutorials/data-source-integration/tutorial.md) | :book: | +| [Galaxy Tool Development](topics/dev/) | [:book:](topics/dev/readme.md) | | [Galaxy Docker Project](https://slides.com/bgruening/the-galaxy-docker-project/) | [:book:](https://slides.com/bgruening/the-galaxy-docker-project/) | From ee89d31ab0c85e19d44c31c138891b66c79328c6 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:38:16 +0200 Subject: [PATCH 26/35] fix agenda styling --- CONTRIBUTING.md | 2 +- templates/tutorials/tutorial1/tutorial.md | 2 +- topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md | 2 +- topics/assembly/tutorials/general-introduction/tutorial.md | 2 +- topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md | 2 +- topics/proteomics/tutorials/database-handling/tutorial.md | 2 +- topics/proteomics/tutorials/labelfree-vs-labelled/tutorial.md | 2 +- topics/proteomics/tutorials/protein-id-sg-ps/tutorial.md | 2 +- topics/sequence-analysis/tutorials/mapping/tutorial.md | 2 +- topics/sequence-analysis/tutorials/quality-control/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-content/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-docker/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-metadata/tutorial.md | 2 +- topics/training/tutorials/create-new-tutorial-tours/tutorial.md | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 48d13136..e4e7d081 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -304,7 +304,7 @@ To improve the learning experience, we strongly recommend you to: > 1. [Pretreatments](#pretreatments) > 2. 
[Mapping](#mapping) > 3. [Analysis of the differential expression](#analysis-of-the-differential-expression) - > {: .agenda} + {: .agenda} ``` ![](shared/images/tutorial_agenda_box.png) diff --git a/templates/tutorials/tutorial1/tutorial.md b/templates/tutorials/tutorial1/tutorial.md index bf591043..3e11524a 100644 --- a/templates/tutorials/tutorial1/tutorial.md +++ b/templates/tutorials/tutorial1/tutorial.md @@ -15,7 +15,7 @@ General introduction about the topic and then an introduction of the tutorial (t > 1. [Pretreatments](#pretreatments) > 2. [Mapping](#mapping) > 3. [Analysis of the differential expression](#analysis-of-the-differential-expression) -> {: .agenda} +{: .agenda} # Part 1 diff --git a/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md b/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md index e2e9105e..a5c55c82 100644 --- a/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md +++ b/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md @@ -20,7 +20,7 @@ SPAdes is a de novo genome assembler written by Pavel Pevzner's group in St. Pet > 1. [Get the data](#get-the-data) > 2. [Assemble with the Velvet Optimiser](#assemble-with-the-velvet-optimiser) > 3. [Assemble with SPAdes](#assemble-with-spades) -> {: .agenda} +{: .agenda} # Get the data diff --git a/topics/assembly/tutorials/general-introduction/tutorial.md b/topics/assembly/tutorials/general-introduction/tutorial.md index 6417365c..2c987085 100644 --- a/topics/assembly/tutorials/general-introduction/tutorial.md +++ b/topics/assembly/tutorials/general-introduction/tutorial.md @@ -19,7 +19,7 @@ In this activity, we will perform a *de novo* assembly of a short read set using > 3. [Assemble reads with Velvet](#assemble-reads-with-velvet) > 4. [Collect some statistics on the contigs](#collect-some-statistics-on-the-contigs) > 5. 
[Discussion](#discussion) -> {: .agenda} +{: .agenda} # Get the data diff --git a/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md b/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md index 03ef1f99..a7eccd99 100644 --- a/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md +++ b/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md @@ -41,7 +41,7 @@ Because of the long processing time for the large original files, we have downsa > 7. [Step 7: Inspection of Tal1 peaks](#step-7-inspection-of-peaks-and-aligned-data) > 8. [Step 8: Identifying unique/common Tal1 peaks](#step-8-identifying-unique-and-common-tal1-peaks-between-states) > 9. [Additional optional analyses](#additional-optional-analyses) -> {: .agenda} +{: .agenda} # Step 1: Quality control diff --git a/topics/proteomics/tutorials/database-handling/tutorial.md b/topics/proteomics/tutorials/database-handling/tutorial.md index 64c80524..79a377fb 100644 --- a/topics/proteomics/tutorials/database-handling/tutorial.md +++ b/topics/proteomics/tutorials/database-handling/tutorial.md @@ -16,7 +16,7 @@ Identifying peptides in proteomic datasets is commonly done by using search engi > 2. [Contaminant Databases](#contaminant-databases) > 3. [Merging Databases](#merging-databases) > 4. [Creating Decoy Databases](#creating-decoy-databases) -> {: .agenda} +{: .agenda} # Loading a Search Database diff --git a/topics/proteomics/tutorials/labelfree-vs-labelled/tutorial.md b/topics/proteomics/tutorials/labelfree-vs-labelled/tutorial.md index 67230e7c..3c0af520 100644 --- a/topics/proteomics/tutorials/labelfree-vs-labelled/tutorial.md +++ b/topics/proteomics/tutorials/labelfree-vs-labelled/tutorial.md @@ -18,7 +18,7 @@ A basic overview of different quantitation techniques is shown below ([original > > 1. [Drawbacks and Benefits of Labelled Quantitation](#drawbacks-and-benefits-of-labelled-quantitation) > 2. 
[Guideline: How to Choose Your Technique](#guideline-how-to-choose-your-technique) -> {: .agenda} +{: .agenda} # Drawbacks and Benefits of Labelled Quantitation diff --git a/topics/proteomics/tutorials/protein-id-sg-ps/tutorial.md b/topics/proteomics/tutorials/protein-id-sg-ps/tutorial.md index f9c279a4..b6fd28e2 100644 --- a/topics/proteomics/tutorials/protein-id-sg-ps/tutorial.md +++ b/topics/proteomics/tutorials/protein-id-sg-ps/tutorial.md @@ -35,7 +35,7 @@ you can use the constructed database before the **DecoyDatabase** :wrench: step. > 2. [Peptide and Protein Identification](#peptide-and-protein-identification) > 4. [Analysis of Contaminants](#analysis-of-contaminants) > 5. [Peptide and Protein Evaluation](#evaluation-of-peptide-and-protein-ids) -> {: .agenda} +{: .agenda} # Preparing raw data diff --git a/topics/sequence-analysis/tutorials/mapping/tutorial.md b/topics/sequence-analysis/tutorials/mapping/tutorial.md index 1f937dc8..847da5c9 100644 --- a/topics/sequence-analysis/tutorials/mapping/tutorial.md +++ b/topics/sequence-analysis/tutorials/mapping/tutorial.md @@ -14,7 +14,7 @@ In the following we will process a dataset with a mapper, 'Bowtie2', and we will > > 1. [Map the data](#mapping) > 3. [Visualize the mapped data](#visualization) -> {: .agenda} +{: .agenda} # Mapping > ### :pencil2: Hands-on: Mapping with Bowtie2 diff --git a/topics/sequence-analysis/tutorials/quality-control/tutorial.md b/topics/sequence-analysis/tutorials/quality-control/tutorial.md index dcba2f6d..2494669a 100644 --- a/topics/sequence-analysis/tutorials/quality-control/tutorial.md +++ b/topics/sequence-analysis/tutorials/quality-control/tutorial.md @@ -17,7 +17,7 @@ The quality control of the sequences right after sequencing is then an essential > 1. [Sequence dataset importing](#sequence-dataset-importing) > 2. [Quality checking of the sequences](#quality-checking-of-the-sequences) > 3. 
[Improvement of the quality of the sequences](#improvement-of-the-quality-of-the-sequences) -> {: .agenda} +{: .agenda} # Sequence dataset importing diff --git a/topics/training/tutorials/create-new-tutorial-content/tutorial.md b/topics/training/tutorials/create-new-tutorial-content/tutorial.md index 23e4a77d..7ad6b43b 100644 --- a/topics/training/tutorials/create-new-tutorial-content/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-content/tutorial.md @@ -31,7 +31,7 @@ In this tutorial, you will learn how to write your first tutorial in markdown an > 1. [Creating Interactive Galaxy Tours](../create-new-tutorial-tours/tutorial.html) > 1. [Building a Docker flavor](../create-new-tutorial-docker/tutorial.html) > 1. [Submitting the new tutorial to the GitHub repository](../../../dev/tutorials/github-contribution/slides.html) -> {: .agenda} +{: .agenda} # Setting up a new tutorial diff --git a/topics/training/tutorials/create-new-tutorial-docker/tutorial.md b/topics/training/tutorials/create-new-tutorial-docker/tutorial.md index b8d3ebfc..26c0d45a 100644 --- a/topics/training/tutorials/create-new-tutorial-docker/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-docker/tutorial.md @@ -30,7 +30,7 @@ In this tutorial, you will learn how to create a virtualised Galaxy instance, ba > 1. [Creating Interactive Galaxy Tours](../create-new-tutorial-tours/tutorial.html) > 1. [Building a Docker flavor](../create-new-tutorial-docker/tutorial.html) > 1. 
[Submitting the new tutorial to the GitHub repository](../../../dev/tutorials/github-contribution/slides.html) -> {: .agenda} +{: .agenda} # Building a Galaxy instance specifically for your training diff --git a/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md b/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md index 42097788..064a2330 100644 --- a/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-jekyll/tutorial.md @@ -31,7 +31,7 @@ In this tutorial, you will learn how to run a local instance of the GTN webiste > 1. [Creating Interactive Galaxy Tours](../create-new-tutorial-tours/tutorial.html) > 1. [Building a Docker flavor](../create-new-tutorial-docker/tutorial.html) > 1. [Submitting the new tutorial to the GitHub repository](../../../dev/tutorials/github-contribution/slides.html) -> {: .agenda} +{: .agenda} # Checking the website generation diff --git a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md index 734c5aca..8ef707d2 100644 --- a/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-metadata/tutorial.md @@ -30,7 +30,7 @@ In this tutorial, you will learn how to annotate your training material with a l > 1. [Creating Interactive Galaxy Tours](../create-new-tutorial-tours/tutorial.html) > 1. [Building a Docker flavor](../create-new-tutorial-docker/tutorial.html) > 1. 
[Submitting the new tutorial to the GitHub repository](../../../dev/tutorials/github-contribution/slides.html) -> {: .agenda} +{: .agenda} # Filling the metadata diff --git a/topics/training/tutorials/create-new-tutorial-tours/tutorial.md b/topics/training/tutorials/create-new-tutorial-tours/tutorial.md index f325a11f..012dcd4c 100644 --- a/topics/training/tutorials/create-new-tutorial-tours/tutorial.md +++ b/topics/training/tutorials/create-new-tutorial-tours/tutorial.md @@ -29,7 +29,7 @@ In this tutorial, you will understand how to design and develop a new tutorial f > 1. [Creating Interactive Galaxy Tours](../create-new-tutorial-tours/tutorial.html) > 1. [Building a Docker flavor](../create-new-tutorial-docker/tutorial.html) > 1. [Submitting the new tutorial to the GitHub repository](../../../dev/tutorials/github-contribution/slides.html) -> {: .agenda} +{: .agenda} # A Galaxy Interactive Tour From 1ad12c5f27df669e60865bed25e3748bb7fb7dee Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:41:17 +0200 Subject: [PATCH 27/35] update template for requirements links --- _layouts/tutorial_hands_on.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_layouts/tutorial_hands_on.html b/_layouts/tutorial_hands_on.html index 29c2307a..ff13dd08 100644 --- a/_layouts/tutorial_hands_on.html +++ b/_layouts/tutorial_hands_on.html @@ -88,7 +88,7 @@ {% for requirement in topic.requirements %}
  • {% if requirement.type == "internal" %} - {{ requirement.title }} + {{ requirement.title }} {% elsif requirement.type == "external" %} {{ requirement.title }} {% endif %} From 215a241752f938c97296495e930bc57db3a0c194 Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:47:33 +0200 Subject: [PATCH 28/35] fix image links in usegalaxy tutorials --- topics/usegalaxy/tutorials/collections/tutorial.md | 52 +++++++-------- topics/usegalaxy/tutorials/dip/tutorial.md | 38 +++++------ topics/usegalaxy/tutorials/dunovo/tutorial.md | 34 +++++----- topics/usegalaxy/tutorials/ngs/tutorial.md | 28 ++++---- topics/usegalaxy/tutorials/non-dip/tutorial.md | 58 ++++++++--------- topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md | 76 +++++++++++----------- 6 files changed, 143 insertions(+), 143 deletions(-) diff --git a/topics/usegalaxy/tutorials/collections/tutorial.md b/topics/usegalaxy/tutorials/collections/tutorial.md index fd04d6a1..06239d6c 100644 --- a/topics/usegalaxy/tutorials/collections/tutorial.md +++ b/topics/usegalaxy/tutorials/collections/tutorial.md @@ -24,65 +24,65 @@ These datasets represent genomic DNA (enriched for mitochondria via a long range Right click (or Ctrl-click) on [this link](https://usegalaxy.org/library/list#folders/Fab5f788f07073c11) to open a new browser window and position this window side-by-side with the window displaying this tutorial. You will see this: -![](../images/grab_data.png) +![](../../images/grab_data.png) {: .img-responsive} Select all datasets and click **to History** button. This will import all datasets into a history. 
Follow the direction and will see a screen like this: -![manyDatasets](../images/manyDatasets.png) +![manyDatasets](../../images/manyDatasets.png) {: .img-responsive} ## Creating a paired dataset collection -Now click the checkbox in and you will see your history changing like this: +Now click the checkbox in ![](../../images/historyItemControls.png) and you will see your history changing like this: -![historyWithCheckboxes](../images/historyWithCheckboxes.png) +![historyWithCheckboxes](../../images/historyWithCheckboxes.png) {: .img-responsive} Let's click **All**, which will select all datasets in the history, then click **For all selected...** and finally select **Build List of Dataset Pairs** from the following menu: -![buildPairs](../images/buildPairs.png) +![buildPairs](../../images/buildPairs.png) {: .img-responsive} The following wizard will appear: -![collectionCreation](../images/collectionCreation.png) +![collectionCreation](../../images/collectionCreation.png) {: .img-responsive} In this case Galaxy automatically assigned pairs using the `_1` and `_2` endings of dataset names. Let's however pretend that this did not happen. Click on **Unpair all** (highlighted in red in the figure above) link and then on **Clear** link (highlighted in blue in the figure above). The interface will change into its virgin state: -![collectionCreationClean](../images/collectionCreationClean.png) +![collectionCreationClean](../../images/collectionCreationClean.png) {: .img-responsive} Hopefully you remember that we have paired-end data in this scenario. Datasets containing the first (forward) and the second (reverse) read are differentiated by having `_1` and `_2` in the filename. We can use this feature in dataset collection wizard to pair our datasets. 
Type `_1` in the left **Filter this list** text box and `_2` in the right: -![1and2](../images/1and2.png) +![1and2](../../images/1and2.png) {: .img-responsive} You will see that the dataset collection wizard will automatically filter lists on each side of the interface: -![collectionPrefiltered](../images/collectionPrefiltered.png) +![collectionPrefiltered](../../images/collectionPrefiltered.png) {: .img-responsive} Now you can either click **Auto pair** if pairs look good to you (proper combinations of datasets are listed in each line) or pair each forward/reverse group individually by pressing **Pair these datasets** button separating each pair: -![collectionCreation](../images/collectionCreation.png) +![collectionCreation](../../images/collectionCreation.png) {: .img-responsive} Now it is time to name the collection: -![collectionNaming](../images/collectionNaming.png) +![collectionNaming](../../images/collectionNaming.png) and create the collection by clicking **Create list**. A new item will appear in the history as you can see on the panel **A** below. Clicking on collection will expand it to show four pairs it contains (panel **B**). Clicking individual pairs will expand them further to reveal **forward** and **reverse** datasets (panel **C**). Expanding these further will enable one to see individual datasets (panel **D**). -![collection_ABCD](../images/collection_ABCD.png) +![collection_ABCD](../../images/collection_ABCD.png) {: .img-responsive} ## Using collections By now we see that a collection can be used to bundle a large number of items into a single history item. This means that many Galaxy tools will be able to process all datasets in a collection transparently to you. 
Let's try to map these datasets to human genome using `bwa-mem` mapper: -![bwa_mem_collection_readGroups](../images/bwa_mem_collection_readGroups.png) +![bwa_mem_collection_readGroups](../../images/bwa_mem_collection_readGroups.png) {: .img-responsive} Here is what you need to do: @@ -95,17 +95,17 @@ Here is what you need to do: You will see jobs being submitted and new datasets appearing in the history. IN particular below you can see that Galaxy has started four jobs (two yellow and two gray). This is because we have eight paired datasets with each pair being processed separately by `bwa-mem`. As a result we have four `bwa-mem` runs: -![bwa_memCollectionRunning](../images/bwa_memCollectionRunning.png) +![bwa_memCollectionRunning](../../images/bwa_memCollectionRunning.png) {: .img-responsive} Once these jobs are finished they will disappear from the history and all results will be represented as a new collection: -![bwa_memCollectionDone](../images/bwa_memCollectionDone.png) +![bwa_memCollectionDone](../../images/bwa_memCollectionDone.png) {: .img-responsive} Let's look at this collection by clicking on it (panel **A** in the figure below). You can see that now this collection is no longer paired (compared to the collection we created in the beginning of this tutorial). This is because `bwa-mem` takes forward and reverse data as input, but produces only a single BAM dataset as the output. So what we have in the result is a *list* of four dataset (BAM files; panels **B** and **C**). 
-![bwa_memCollection_ABC](../images/bwa_memCollection_ABC.png) +![bwa_memCollection_ABC](../../images/bwa_memCollection_ABC.png) {: .img-responsive} ## Processing collection as a single entity @@ -116,12 +116,12 @@ Now that `bwa-mem` has finished and generated a collection of BAM datasets we ca Let's perform cleanup of our BAM files with `cleanSam` utility from the **Picard** package: -![cleanSam](../images/cleanSam.png) +![cleanSam](../../images/cleanSam.png) {: .img-responsive} If you look at the picture above carefully, you will see that the **Select SAM/BAM dataset or dataset collection** parameter is empty (it says `No sam or bam datasets available.`). This is because we do not have single SAM or BAM datasets in the history. Instead we have a collection. So all you need to do is to click on the **folder** button and you will get our BAM collection selected: -![cleanSam_closeup](../images/cleanSam_closeup.png) +![cleanSam_closeup](../../images/cleanSam_closeup.png) {: .img-responsive} Click **Execute**. As an output this tool will produce a collection contained cleaned data. @@ -130,24 +130,24 @@ Click **Execute**. As an output this tool will produce a collection contained cl Now let's clean the dataset further by only preserving truly paired reads (reads satisfying two requirements: (1) read is paired, and (2) it is mapped as a proper pair). For this we will use `Filter SAM or BAM` tools from **SAMTools** collection: -![filter](../images/filter.png) +![filter](../../images/filter.png) {: .img-responsive} parameters should be set as shown below. By setting mapping quality to `20` we avoid reads mapping to multiple locations and by using **Filter on bitwise flag** option we ensure that the resulting dataset will contain only properly paired reads. This operation will produce yet another collection containing now filtered datasets. 
-![filter_closeup](../images/filter_closeup.png) +![filter_closeup](../../images/filter_closeup.png) {: .img-responsive} ### Merging collection into a single dataset The beauty of BAM datasets is that they can be combined in a single entity using so called *Read group*. This allows to bundle reads from multiple experiments into a single dataset where read identity is maintained by labelling every sequence with *read group* tags. So let's finally reduce this collection to a single BAM dataset. For this we will use `MergeSamFiles` tool for the `Picard` suite: -![merge](../images/merge.png) +![merge](../../images/merge.png) {: .img-responsive} Here we select the collection generated by the filtering tool described above: -![merge_closeup](../images/merge_closeup.png) +![merge_closeup](../../images/merge_closeup.png) {: .img-responsive} This operation will **not** generate a collection. Instead, it will generate a single BAM dataset containing mapped reads from our four samples (`M117-bl`, `M117-ch`, `M117C1-bl`, and `M117C1-ch`). @@ -156,21 +156,21 @@ This operation will **not** generate a collection. Instead, it will generate a s So we have one BAM dataset combining everything we've done so far. Let's look at the contents of this dataset using a genome browser. First, we will need to downsample the dataset to avoiding overwhelming the browser. For this we will use `Downsample SAM/BAM` tool: -![downsample](../images/downsample.png) +![downsample](../../images/downsample.png) {: .img-responsive} Set **Probability (between 0 and 1) that any given read will be kept** to roughly `5%` (or `0.05`) using the slider control: -![downsample_closeup](../images/downsample_closeup.png) +![downsample_closeup](../../images/downsample_closeup.png) This will generate another BAM dataset containing only 5% of the original reads and much smaller as a result. 
Click on this dataset and you will see links to various genome browsers: -![browserLinks](../images/browserLinks.png) +![browserLinks](../../images/browserLinks.png) {: .img-responsive} Click the **Human hg38** link in the **display with IGV** line as highlighted above (to learn more about displaying Galaxy data in IGV with this [movie](https://vimeo.com/123442619#t=4m16s)). Below is an example generated with IGV on these data. In this screenshot reads are colored by read group (four distinct colors). A yellow inset displays additional information about a single read. One can see that this read corresponds to read group `M117-bl`. -![igv](../images/igv.png) +![igv](../../images/igv.png) {: .img-responsive} ## We did not fake this: diff --git a/topics/usegalaxy/tutorials/dip/tutorial.md b/topics/usegalaxy/tutorials/dip/tutorial.md index 93276f36..ab91a64c 100644 --- a/topics/usegalaxy/tutorials/dip/tutorial.md +++ b/topics/usegalaxy/tutorials/dip/tutorial.md @@ -92,7 +92,7 @@ Now, the probability of having a variant and it being observed in our sequencing | | | | |:----------------------------------:|:----------------------------------:|:-----------------------------------:| -| ![](../images/pA.png) | ![](../images/pB.png) | ![](../images/pAB.png) | +| ![](../../images/pA.png) | ![](../../images/pB.png) | ![](../../images/pAB.png) | | $P(A)$
    Polymorphisms | $P(B)$
    Variant calls | $P(AB)$
    Polymorphisms + Varinat calls | Now we can ask the following question: *What is the probability of a having a real polymorphism* $A$ *given our observation of variants in reads* $B$? In other words *what is the probability of* $A$ *given* $B$? Or, as stated in the original [blog](https://oscarbonilla.com/2009/05/visualizing-bayes-theorem/): "*given that we are in region $B$ what is the probability that we are in the region $AB$*?": @@ -154,7 +154,7 @@ Freebayes is a *haplotype-based* variant caller. This implies that instead of lo | | |-------------------------------------------| -| ![](../images/freebayes.png) | +| ![](../../images/freebayes.png) | |Looking at a haplotype window makes misalignments tolerable. In this case a low complexity poly(A) stretch is misaligned. As a result looking at individual positions will result in calling multiple spurious varians. In the case of FreeBayes looking at a haplotype identifies two alleles (this is a diploid example) `A(7)` and `A(6)`, while `A(8)` is likely an error. Image by [Erik Garrison](https://github.com/ekg/freebayes)| # Let's try it @@ -175,15 +175,15 @@ Here is what to do to load the data: > >Go to the [data library](https://usegalaxy.org/library/list#folders/F9ff2d127cd7ed6bc) and select both BAM and PED datasets. Then Click **to History** button: > ->![](../images/library_import.png) +>![](../../images/library_import.png) > >Galaxy will ask you if you want to import these data into a new history, which you might want (in the case below I called this history `genotyping try`): > ->![](../images/history_import.png) +>![](../../images/history_import.png) > >The datasets will appear in your history: > ->![](../images/library_import_complete.png) +>![](../../images/library_import_complete.png) > {: .hands_on} @@ -194,11 +194,11 @@ Here is what to do to load the data: >Select **FreeBayes** from **NGS: Variant Analysis** section of the tool menu (left pane of Galaxy's interface). 
>Make sure the top part of the interface looks like shown below. Here we selected `GIAB-Ashkenazim-Trio-hg19` as input and set **Using reference genome** to `hg19` and **Choose parameter selection level** to `5`. The interface should look like this: > ->![](../images/FreeBayes_settings.png) +>![](../../images/FreeBayes_settings.png) > >Scrolling down to **Tweak algorithmic features?** click `Yes` and set **Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output** to `Yes`. This would help us evaluating the quality of genotype calls. > ->![](../images/freebayes_gq.png) +>![](../../images/freebayes_gq.png) > >Depending on how busy Galaxy is this may take a little bit of time (coffee break?). Eventially this will produce a dataset in [VCF](http://www.1000genomes.org/wiki/Analysis/variant-call-format) format containing 35 putative variants. Before we can continue we need to post-process this dataset by breaking compound variants into multiple independent variants with **VcfAllelicPrimitives** tool found within **NGS: VCF Manipulation** section. This is necessary for ensuring the smooth sailing through downstream analyses: > @@ -208,7 +208,7 @@ Here is what to do to load the data: > >Select FreeBayes output as the input for this tool and make sure **Maintain site and allele-level annotations when decomposing** and **Maintain genotype-level annotations when decomposing** are set to `Yes`: > ->![](../images/vcfallelicprimitives.png) +>![](../../images/vcfallelicprimitives.png) > {: .hands_on} @@ -235,15 +235,15 @@ At this point we are ready to begin annotating variants using [SnpEff](http://sn > >Select **NGS: Variant Analysis** → **SnpEff**. Select the latest version of annotation database matching genome version against which reads were mapped and VCF produced. 
In this case it is `GRCh37.75: hg19`: > ->![](../images/snpeff.png) +>![](../../images/snpeff.png) > >SnpEff will generate two outputs: (1) an annotated VCF file and (2) an HTML report. The report contains a number of useful metrics such as distribution of variants across gene features: > ->![](../images/snpeff_chart.png) +>![](../../images/snpeff_chart.png) > >or changes to codons: > ->![](../images/snpeff_codons.png) +>![](../../images/snpeff_codons.png) > {: .hands_on} @@ -267,11 +267,11 @@ The first step is to convert a VCF file we would like to analyze into a GEMINI d > >So let's load data into GEMINI. Set VCF and PED inputs: > ->![](../images/gemini_load.png) +>![](../../images/gemini_load.png) > >This creates a sqlite database. To see the content of the database use **GEMINI_db_info**: > ->![](../images/gemini_db_info.png) +>![](../../images/gemini_db_info.png) > >This produce a list of [all tables and fields](https://github.com/nekrut/galaxy/wiki/datasets/gemini_tables.txt) in the database. > @@ -294,7 +294,7 @@ The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/ge > >into **The query to be issued to the database** field of the interface: > ->![](../images/gemini_query1.png) +>![](../../images/gemini_query1.png) > >As we can see from [output (Click this link to see it)](https://usegalaxy.org/datasets/bbd44e69cb8906b51bb37b9032761321/display/?preview=True) there are 21 variants that are not annotated in dbSNP. 
> @@ -345,7 +345,7 @@ GEMINI provides access to genotype, sequencing depth, genotype quality, and geno >gt_types.HG002_NA24385_son <> HOM_REF >``` > ->![](../images/gemini_query2.png) +>![](../../images/gemini_query2.png) > >This produce [a list of sites](https://usegalaxy.org/datasets/bbd44e69cb8906b560921700703d0255/display/?preview=True) > @@ -444,18 +444,18 @@ This short tutorial should give you an overall idea on how generate variant data * Right click on **Galaxy history** link and open Galaxy history in another new browser tab * When Galaxy history interface opens you will need to click **Import history** link highlighted within a red outline in the following figure: -![](../images/import_history.png) +![](../../images/import_history.png) * If you have a wide screen arrange browsers tabs side by side: -![](../images/side-by-side.png) +![](../../images/side-by-side.png) * Proceed with tutorial. For example, to repeat the following command from GEMINI tutorial: -![](../images/gemini_command.png) +![](../../images/gemini_command.png) * Use Galaxy's **GEMINI_load** tool: -![](../images/galaxy_command.png) +![](../../images/galaxy_command.png) * and so on.... diff --git a/topics/usegalaxy/tutorials/dunovo/tutorial.md b/topics/usegalaxy/tutorials/dunovo/tutorial.md index 46adf7e4..96a67063 100644 --- a/topics/usegalaxy/tutorials/dunovo/tutorial.md +++ b/topics/usegalaxy/tutorials/dunovo/tutorial.md @@ -11,7 +11,7 @@ This page explains how to perform discovery of low frequency variants from duple Calling low frequency variants from next generation sequencing (NGS) data is challenging due to significant amount of noise characteristic of these technologies. [Duplex sequencing](http://www.pnas.org/content/109/36/14508.short) (DS) was designed to address this problem by increasing sequencing accuracy by over four orders of magnitude. DS uses randomly generated barcodes to uniquely tag each molecule in a sample. 
The tagged fragments are then PCR amplified prior to the preparation of a sequencing library, creating fragment families characterized by unique combination of barcodes at both 5’ and 3’ ends: ->[![duplex](../images/ds.png)](http://www.pnas.org/content/109/36/14508/F1.expansion.html) +>[![duplex](../../images/ds.png)](http://www.pnas.org/content/109/36/14508/F1.expansion.html) > >The logic of duplex sequencing. From [Schmitt:2012](http://www.pnas.org/content/109/36/14508.short). @@ -22,7 +22,7 @@ The computational analysis of DS data (Part `C` in the figure above) produces tw The DCSs have the ultimate accuracy, yet the SSCSs can also be very useful when ampliconic DNA is used as an input to a DS experiment. Let us illustrate the utility of SSCSs with the following example. Suppose one is interested in quantifying variants in a virus that has a very low titer in body fluids. Since DS procedure requires a substantial amount of starting DNA (between [between 0.2 and 3 micrograms](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4271547/)) the virus needs to be enriched. This can be done, for example, with a PCR designed to amplify the entire genome of the virus. Yet the problem is that during the amplification heterologous strands will almost certainly realign to some extent forming hetoroduplex molecules: ->![hd](../images/het.png) +>![hd](../../images/het.png) > >Heteroduplex formation in ampliconic templates. Image by Barbara Arbeithuber from [Stoler:2016](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-1039-4). Here there are two distinct types of viral genomes: carrying `A` and `G`. Because the population of genomes is enriched via PCR, heteroduplex formation takes place, skewing frequency estimates performed using DCSs. @@ -42,7 +42,7 @@ This analysis (and consequently the Galaxy's history) can be divided into three 3. 
Analysis of Single Strand Consensus Sequences (SSCS): ->![steps](../images/steps.png) +>![steps](../../images/steps.png) > >Analysis outline @@ -56,7 +56,7 @@ We uploaded [Schmitt:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4414912/ | :--|:--- -![](../images/abl1-f-qc.png) | ![](../images/abl1-r-qc.png) +![](../../images/abl1-f-qc.png) | ![](../../images/abl1-r-qc.png) **A**. Forward | **B**. Reverse One can see that these data are of excellent quality and no additional processing is required before we can start the actual analysis. @@ -71,7 +71,7 @@ From tool section **NGS: Du Novo** we ran: This is the exact image of the **Make consensus reads** interface: ->![Make consesni](../images/makeCons.png) +>![Make consesni](../../images/makeCons.png) > >Making DCS and SSCS. **Note** that **Output single-strand consensus sequences** is set to `Yes`. [Above](#background) we explained why single-strand consensus sequences (SSCS) may be important in some applications. [Below](#analysis-of-single-strand-consensus-data) we show how they can be used. @@ -81,7 +81,7 @@ This is the exact image of the **Make consensus reads** interface: The _Du Novo_ algorithm occasionally inserts`N`and/or [IUPAC notations](https://en.wikipedia.org/wiki/Nucleic_acid_notation) at sites where a definive base cannot be identified according to the major rule consensus. We however do not want such bases when we call variants. The tool **Sequence Content Trimmer** will help with filtering these out. Here are the parameters we used: ->![ContentTrimmer](../images/contentTrimmer.png) +>![ContentTrimmer](../../images/contentTrimmer.png) > >Sequence Content Trimmer settings . Where:
    - `Paired reads = Paired` (because DCSs are reported as forward and reverse)
    - `Bases to filter on = NRYSWKMBDHV` (all ambiguous nucleotides)
    - `Frequency threshold = 0.2` (A window /see the next parameter below/ cannot have more than 20% of ambiguous bases)
    - `Size of the window = 10` (Size of the window)
    - `Invert filter bases = No`
    - `Set a minimum read length = 50` (We do not want _very_ short reads) @@ -89,9 +89,9 @@ The _Du Novo_ algorithm occasionally inserts`N`and/or [IUPAC notations](https:// [The previous step](#filtering-consensuses) filters forward and reverse DCSs and reports them in [FASTA](https://en.wikipedia.org/wiki/FASTA_format) format. Yet the downstream tools require [fastq](https://en.wikipedia.org/wiki/FASTQ_format) format. To address this we convert FASTA into fastq using **Combine FASTA and QUAL** from tool section **NGS: QC and manipulation**. In this case the quality values are filled in with the maximum allowed value of 93 (essentially we fake them here), which is fine as we will not rely on quality scores in the rest of the analysis. ->![ContentTrimmer](../images/combineFandQ.png) +>![ContentTrimmer](../../images/combineFandQ.png) > ->Combine FASTA and QUAL. **Note** that here two datasets (#8 and #9) are selected simultaneously because we clicked the multiple datasets button the left of the **FASTA File** dropdown:
    ![](../images/multiDataset.png) +>Combine FASTA and QUAL. **Note** that here two datasets (#8 and #9) are selected simultaneously because we clicked the multiple datasets button to the left of the **FASTA File** dropdown:&#10;
    ![](../../images/multiDataset.png) ## Calling variants @@ -106,13 +106,13 @@ At this point we have trimmed DCSs in fastq format. We can now proceed to callin Here we use two mappers for added reliability (this is not necessary in most situations as long as you use the right mapper for input data). To differentiate between results produced by each mapper we assign readgroups (this is done by clicking on **Set read groups information** dropdown). For example, for **BWA-MEM** you would set parameters like this: ->![](../images/bwa-mem.png) +>![](../../images/bwa-mem.png) > >Running BWA-MEM. **Note** that we are comparing DCSs against human genome version `hg38`, use forward and reverse DCSs are the `first` and `second` set of reads. Readgroup **SM** and **ID** tags are set `bwa-mem`. We then repeat essentially the same with **BWA**: ->![](../images/bwa.png) +>![](../../images/bwa.png) > >Running BWA. **Note** here we use `bwa` as the readgroup **ID** and **SM** tags. @@ -120,7 +120,7 @@ We then repeat essentially the same with **BWA**: Since we have used two mappers - we have two BAM datasets. Yet because we have set readgroups we can now merge them into a single BAM dataset. This is because the individual reads will be labelled with readgroups (you will see how it will help later). To merge we use **MergeSamFiles** from tool section **NGS: Picard**: ->![](../images/mergeSamFiles.png) +>![](../../images/mergeSamFiles.png) > >Merging BAM datasets. @@ -128,7 +128,7 @@ Since we have used two mappers - we have two BAM datasets. Yet because we have s To normalize the positional distribution of indels we use **Left Align** utility (**NGS: Variant Analysis**) from [FreeBayes](https://github.com/ekg/freebayes#indels) package. This is necessary to avoid erroneous polymorphisms flanking regions with indels (e.g., in low complexity loci): ->![](../images/leftAlign.png) +>![](../../images/leftAlign.png) > >Left aligning indels. **Note** here we use `hg38` as well. 
Obviously, one must use the same genome built you have aligned against with **BWA-MEM** and **BWA**. @@ -136,21 +136,21 @@ To normalize the positional distribution of indels we use **Left Align** utility To identify sites containing variants we use **Naive Variant Caller (NVC)** (tool section **NGS: Variant Analysis**) which produces a simple count of differences given coverage and base quality per site (remember that our qualities were "faked" during the conversion from FASTA to fastq and cannot be used here). So in the case of _ABL1_ we set parameters as follow: ->![](../images/nvc.png) +>![](../../images/nvc.png) > >Finding variants with NVC. Here:
    - `Using reference genome = hg38` (As mentioned above, needs to be set to the same genome one have mapped against.)
    - `Restrict to regions: Chromosome = chr9` (_ABL1_ is on chromosome 9. We set this to prevent **NVC** from wandering across the genome to save time.)
    - `Minimum number of reads needed to consider a REF/ALT = 0` (Trying to maximize the number of sites. We can filter later.)
    - `Minimum base quality = 20` (This default and is irrelevant because of "faking" quality scores during the conversion from FASTA to fastq).
    - `Minimum mapping quality = 20` (This is helpful because it prevents reads mapping to multiple locations from being included in the tabulation. Such reads will have mapping quality of 0.)
    - `Ploidy = 1` (Ploidy is irrelevant here as it is a mixture of multiple genomes)
    - `Only write out positions with possible alternate alleles = No` (We can filter later)
    - `Report counts by strand = Yes` (This will be helpful to gauge the strand bias). The **NVC** generates a [VCF](https://en.wikipedia.org/wiki/Variant_Call_Format) file that can be viewed at genome browsers such as [IGV](https://www.broadinstitute.org/igv/). Yet one rarely finds variants by looking at genome browsers. The next step is to generate a tab-delimited dataset of nucleotide counts using **Variant Annotator** from tool section **NGS: Variant Analysis**. We ran it with the following parameters: ->![](../images/va.png) +>![](../../images/va.png) > >Annotating variable sites. Here `Coverage threshold = 10` (To reduce noise) and `Output stranded base counts = Yes` (to see strand bias) There are 3,264 lines in the output, which is clearly too much. Using **Filter** tool (tool section **Filter and Sort**) with expression `c16 >= 0.01`(because column 16 contains minor allele frequency - MAF - and we are interested in those sites where MAF >= 1%): ->![](../images/filter.png) +>![](../../images/filter.png) > >Filtering variable sites. @@ -186,12 +186,12 @@ The analysis described above can be rerun using a workflow. Workflow combined al * _Du Novo_ analysis from reads (import from [here](https://usegalaxy.org/u/aun1/w/duplex-analysis-from-reads)). This workflow uses fastq reads as input. It should be used if you analyze data for first time. * _Du Novo_ analysis from aligned families (import from [here](https://usegalaxy.org/u/aun1/w/copy-of-duplex-analysis-from-reads)). This workflow starts with aligned families. It should be used for re-analysis of already generated DCS and SSCS data. 
->[![](../images/fromReads.png)](https://galaxyproject.org/duplex/fromReads.png) +>[![](../../images/fromReads.png)](https://galaxyproject.org/duplex/fromReads.png) > >Starting from Reads ->[![](../images/fromDCS.png)](https://galaxyproject.org/duplex/fromDCS.png) +>[![](../../images/fromDCS.png)](https://galaxyproject.org/duplex/fromDCS.png) > >Starting from DCS/SSCS data diff --git a/topics/usegalaxy/tutorials/ngs/tutorial.md b/topics/usegalaxy/tutorials/ngs/tutorial.md index bf15697a..38b528e7 100644 --- a/topics/usegalaxy/tutorials/ngs/tutorial.md +++ b/topics/usegalaxy/tutorials/ngs/tutorial.md @@ -88,7 +88,7 @@ It is common to prepare pair-end and mate-pair sequencing libraries. This is hig | | |----| -| ![](../images/pe_mp.png) | +| ![](../../images/pe_mp.png) | |**Paired-end and mate-pair reads**. In paired end sequencing (left) the actual ends of rather short DNA molecules (less than 1kb) are determined, while for mate pair sequencing (right) the ends of long molecules are joined and prepared in special sequencing libraries. In these mate pair protocols, the ends of long, size-selected molecules are connected with an internal adapter sequence (i.e. linker, yellow) in a circularization reaction. The circular molecule is then processed using restriction enzymes or fragmentation. Fragments are enriched for the linker and outer library adapters are added around the two combined molecule ends. The internal adapter can then be used as a second priming site for an additional sequencing reaction in the same orientation or sequencing can be performed from the second adapter, from the reverse strand. (From Ph.D. dissertation by [Martin Kircher](https://core.ac.uk/download/pdf/35186947.pdf))| @@ -170,7 +170,7 @@ The base qualities allow us to judge how trustworthy each base in a sequencing r Illumina sequencing is based on identifying the individual nucleotides by the fluorescence signal emitted upon their incorporation into the growing sequencing read. 
Once the fluorescence intensities are extracted and translated into the four letter code. The deduction of nucleotide sequences from the images acquired during sequencing is commonly referred to as base calling. Due to the imperfect nature of the sequencing process and limitations of the optical instruments, base calling will always have inherent uncertainty. This is the reason why FASTQ files store the DNA sequence of each read together with a position-specific quality score that represents the error probability, i.e., how likely it is that an individual base call may be incorrect. The score is called [Phred score](http://www.phrap.com/phred/), $Q$, which is proportional to the probability $p$ that a base call is incorrect, where $Q = −10lg(p)$. For example, a Phred score of 10 corresponds to one error in every ten base calls ($Q = −10lg(0.1)$), or 90% accuracy; a Phred score of 20 corresponds to one error in every 100 base calls, or 99% accuracy. A higher Phred score thus reflects higher confidence in the reported base. To assign each base a unique score identifier (instead of numbers of varying character length), Phred scores are typically represented as ASCII characters. At http://ascii-code.com/ you can see which characters are assigned to what number. For raw reads, the range of scores will depend on the sequencing technology and the base caller used (Illumina, for example, used a tool called Bustard, or, more recently, RTA). Unfortunately, Illumina has been anything but consistent in how they calculated and ASCII-encoded the Phred score (see below)! In addition, Illumina now allows Phred scores for base calls with as high as 45, while 41 used to be the maximum score until the HiSeq X. This may cause issues with downstream sapplications that expect an upper limit of 41. 
-![](../images/illumina_qs.png) +![](../../images/illumina_qs.png) @@ -180,7 +180,7 @@ Sanger/Phred format that is also used by other sequencing platforms and the sequ | | |--------------------------------------------------------------| -| ![](../images/fastq_qs.png) | +| ![](../../images/fastq_qs.png) | | The ASCII interpretation and ranges of the different Phred score notations used by Illumina and the original Sanger interpretation. Although the Sanger format allows a theoretical score of 93, raw sequencing reads typically do not exceed a Phred score of 60. In fact, most Illumina-based sequencing will result in maximum scores of 41 to 45 (image from [Wikipedia](https://en.wikipedia.org/wiki/FASTQ_format) | ## Assessing data quality @@ -189,7 +189,7 @@ One of the first steps in the analysis of NGS data is seeing how good the data a | | | |:---------------------------------------|:-----------------------------------| -| ![](../images/good_fq.png) | ![](../images/bad_fq.png) | +| ![](../../images/good_fq.png) | ![](../../images/bad_fq.png) | |**A.** Excellent quality | **B.** Hmmm...OK | Here you can see FastQC base quality reports (the tools gives you many other types of data) for two datasets: **A** and **B**. The **A** dataset has long reads (250 bp) and very good quality profile with no qualities dropping below [phred score](http://www.phrap.com/phred/) of 30. The **B** dataset is significantly worse with ends of the reads dipping below phred score of 20. The **B** reads may need to be trimmed for further processing. @@ -216,7 +216,7 @@ Mappers usually compare reads against a reference sequence that has been transfo | | |--------------------------------------------------------------| -| ![](../images/cached_genome.png) | +| ![](../../images/cached_genome.png) | |Mapping against a pre-computed index in Galaxy.| For example, the image above shows indexes for `hg38` version of the human genome. 
You can see that there are actually three choices: (1) `hg38`, (2) `hg38 canonical` and (3) `hg38 canonical female`. The `hg38` contains all chromosomes as well as all unplaced contigs. The `hg38 canonical` does not contain unplaced sequences and only consists of chromosomes 1 through 22, X, Y, and mitochondria. The @@ -236,7 +236,7 @@ If Galaxy does not have a genome you need to map against, you can upload your ge | | |--------------------------------------------------------------| -| ![](../images/uploaded_genome.png) | +| ![](../../images/uploaded_genome.png) | |Mapping against a pre-computed index in Galaxy | In this case Galaxy will first create an index from this dataset and then run mapping analysis against it. The following video shows how this works in practice: @@ -253,7 +253,7 @@ As shown below, SAM files typically contain a short header section and a very lo | | |--------------------------------------------------------------| -| ![](../images/bam_structure.png) | +| ![](../../images/bam_structure.png) | |**Schematic representation of a SAM file**. Each line of the optional header section starts with “@”, followed by the appropriate abbreviation (e.g., SQ for sequence dictionary which lists all chromosomes names (SN) and their lengths (LN)). The vast majority of lines within a SAM file typically correspond to read alignments where each read is described by the 11 mandatory entries (black font) and a variable number of optional fields (grey font). From [tutorial](http://chagall.med.cornell.edu/RNASEQcourse/Intro2RNAseq.pdf) by Friederike Dündar, Luce Skrabanek, and Paul Zumbo.| ## SAM Header @@ -282,7 +282,7 @@ ERR458493 .552967 16 chrI 140 255 12 M61232N37M2S * 0 0 CCACTCGTTCACCAGGGCCGGCGG The following table explains the format and content of each field. The `FLAG`, `CIGAR`, and the optional fields (marked in blue) are explained in more detail below. 
The number of optional fields can vary widely between different SAM files and even between reads within in the same file. The field types marked in blue are explained in more detail in the main text below. -![](../images/sam_fields.png) +![](../../images/sam_fields.png) ### `FLAG` field @@ -292,7 +292,7 @@ The following table gives an overview of the different properties that can be en | | |--------------------------------------------------------------| -| ![](../images/sam_flag.png) | +| ![](../../images/sam_flag.png) | |The `FLAG` field of SAM files stores information about the respective read alignment in one single decimal number. The decimal number is the sum of all the answers to the Yes/No questions associated with each binary bit. The hexadecimal representation is used to refer to the individual bits (questions). A bit is set if the corresponding state is true. For example, if a read is paired, `0x1` will be set, returning the decimal value of 1. Therefore, all `FLAG` values associated with paired reads must be uneven decimal numbers. Conversely, if the `0x1` bit is unset (= read is not paired), no assumptions can be made about `0x2`, `0x8`, `0x20`, `0x40` and `0x80` because they refer to paired reads. 
From [tutorial](http://chagall.med.cornell.edu/RNASEQcourse/Intro2RNAseq.pdf) by Friederike Dündar, Luce Skrabanek, and Paul Zumbo| In a run with single reads, the flags you most commonly see are: @@ -339,7 +339,7 @@ The sum of lengths of the **M**, **I**, **S**, **=**, **X** operations must equa | | |---------------------------------| -|![](../images/cigar.png)| +|![](../../images/cigar.png)| |From [tutorial](http://chagall.med.cornell.edu/RNASEQcourse/Intro2RNAseq.pdf) by Friederike Dündar, Luce Skrabanek, and Paul Zumbo.| ### Optional fields @@ -371,11 +371,11 @@ One of the key features of SAM/BAM format is the ability to label individual rea One of the best descriptions of BAM readgroups is on [GATK support site](http://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files). We have gratefully stolen two tables describing the most important readgroup tags - `ID`, `SM`, `LB`, and `PL` - from GATK forum and provide them here: -![](../images/rg.png) +![](../../images/rg.png) GATK forum also provides the following example: -![](../images/rg_example.png) +![](../../images/rg_example.png) To see an example of read group manipulation in Galaxy see the following video: @@ -406,7 +406,7 @@ Preparation of sequencing libraries (at least at the time of writing) for techno | | |----------------------------------------------| -| ![](../images/pcr-duplicates.png) | +| ![](../../images/pcr-duplicates.png) | |Analyzing molecules aligning with the same outer coordinates, a mapping quality of at least 30 and a length of at least 30nt, resulted in an average coverage of 12.9 per PCR duplicate and an empirical coverage distribution similar to an exponential/power law distribution (left upper panel). This indicates that many molecules are only observed for deeper sequencing while other molecules are available at higher frequencies. 
Analyzing length (left middle panel) and GC content (left lower panel) patterns as well as the combination (right panel) shows higher PCR duplicate counts for a GC content between 30% to 70% as well as for shorter molecules compared to longer molecules. This effect may be due to an amplification bias from the polymerase or the cluster generation process necessary for Illumina sequencing. From Ph.D. dissertation of [Martin Kircher](http://www.qucosa.de/fileadmin/data/qucosa/documents/7110/pflichtexemplar_final.pdf)).| Duplicates can be identified based on their outer alignment coordinates or using sequence-based clustering. One of the common ways for identification of duplicate reads is the `MarkDuplicates` utility from [Picard](https://broadinstitute.github.io/picard/command-line-overview.html) package. It is designed to identify both PCR and optical duplicates: @@ -423,5 +423,5 @@ However, one has to be careful when removing duplicates in cases when the sequen | | |----------------------------------------------| -| ![](../images/sampling-bias.png) | +| ![](../../images/sampling-bias.png) | | The Variant Allele Frequency (VAF) bias determined by coverage and insert size variance. Reads are paired-end and read length is 76. The insert size distribution is modeled as a Gaussian distribution with mean at 200 and standard deviation shown on the x-axis. The true VAF is 0.05. The darkness at each position indicates the magnitude of the bias in the VAF. (From Zhou et al. [2013](http://bioinformatics.oxfordjournals.org/content/30/8/1073)). 
| diff --git a/topics/usegalaxy/tutorials/non-dip/tutorial.md b/topics/usegalaxy/tutorials/non-dip/tutorial.md index c0cec573..a6ff958a 100644 --- a/topics/usegalaxy/tutorials/non-dip/tutorial.md +++ b/topics/usegalaxy/tutorials/non-dip/tutorial.md @@ -21,7 +21,7 @@ There are two ways one can call variants: | | |--------------------------| -| ![](../images/ref_vs_assembly.jpg) | +| ![](../../images/ref_vs_assembly.jpg) | | This figure from a manuscript by [Olson:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4493402/) contrasts the two approaches. | In this tutorials we will take the *first* path is which we map reads against an existing assembly. Later in the course (after we learn about assembly approaches) we will try the second approach as well. @@ -36,7 +36,7 @@ For this tutorial we have prepared a subset of data previously [published](http: > ### Data upload from a Galaxy Library > -> ![](../images/mt_lib.png) +> ![](../../images/mt_lib.png) > > * Go to this [this Galaxy library](https://usegalaxy.org/library/list#folders/Fe4842bd0c37b03a7) > * You will see screen like the one shown above @@ -46,7 +46,7 @@ For this tutorial we have prepared a subset of data previously [published](http: > * click **Import** > * A green message will appear once the import is done. Click on it and will see the history you have just created. It will be populated with the four datasets as shown below: > -> ![](../images/mt_imported_data.png) +> ![](../../images/mt_imported_data.png) > {: .hands_on} @@ -57,13 +57,13 @@ Before proceeding with the analysis, we need to find out how good the data actua > ### Quality Control of the data > > -> ![](../images/mt_qc.png) +> ![](../../images/mt_qc.png) > ->QC'ing reads using [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Note that we selected all four datasets at once by pressing the middle button ![](../images/mt_middle_button.png) adjacent to the **Short read data from your current history** widget. 
Once `FastQC` job runs, you will be able to look at the HTML reports generated by this tool. +>QC'ing reads using [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Note that we selected all four datasets at once by pressing the middle button ![](../../images/mt_middle_button.png) adjacent to the **Short read data from your current history** widget. Once `FastQC` job runs, you will be able to look at the HTML reports generated by this tool. > >The data have generally high quality in this example: > ->![](../images/mt_qc_plot.png) +>![](../../images/mt_qc_plot.png) > >FastQC plot for one of the mitochondrial datasets shows that qualities are acceptable for 250 bp reads (mostly in the green, which is at or above [phred score](https://en.wikipedia.org/wiki/Phred_quality_score) of 30). {: .hands_on} @@ -75,12 +75,12 @@ Our reads are long (250 bp) and as a result we will be using [bwa mem](https://a > ### Mapping with `bwa mem` > ->![](../images/mt_bwa_mem.png) +>![](../../images/mt_bwa_mem.png) > >Running `bwa mem` on our datasets. Look **carefully** at parameter settings: > > * We select `hg38` version of the human genome as the reference -> * By using the middle button again ![](../images/mt_middle_button.png) we select datasets 1 and 3 as **Select the first set of reads** and datasets 2 and 4 as **Select the second set of reads**. Galaxy will automatically launch two bwa-mem jobs using datasets 1,2 and 3,4 generating two resulting BAM files. +> * By using the middle button again ![](../../images/mt_middle_button.png) we select datasets 1 and 3 as **Select the first set of reads** and datasets 2 and 4 as **Select the second set of reads**. Galaxy will automatically launch two bwa-mem jobs using datasets 1,2 and 3,4 generating two resulting BAM files. > * By setting **Set read groups information** to `Set read groups (SAM/BAM specifications)` and clicking **Auto-assign** we will ensure that the reads in the resulting BAM dataset are properly set. 
{: .hands_on} @@ -92,7 +92,7 @@ We can BAM dataset using **NGS: Picard** → **MergeSAMFiles** tool: > ### Merging multiple datasets into one > ->![](../images/mt_bam_merging.png) +>![](../../images/mt_bam_merging.png) > >Merging two BAM datasets into one. Note that two inputs are highlighted. {: .hands_on} @@ -109,7 +109,7 @@ Let's use **NGS: Picard** → **MarkDuplicates** tool: > ### De-duplicating mapped data > ->![](../images/mt_dedup.png) +>![](../../images/mt_dedup.png) > >De-duplicating the merged BAM dataset {: .hands_on} @@ -166,7 +166,7 @@ Let's perform left alignment using **NGS: Variant Analysis** → **BamLeftAl > ### Left-aligning indels > ->![](../images/mt_left_align.png) +>![](../../images/mt_left_align.png) > >Left-aligning a de-duplicated BAM dataset {: .hands_on} @@ -177,7 +177,7 @@ Remember that we are trying to call variants in mitochondrial genome. Let focus > ### Filtering BAM data > ->![](../images/mt_filtering.png) +>![](../../images/mt_filtering.png) > >Filtering reads. There are several important point to note here: > @@ -193,19 +193,19 @@ FreeBayes is widely used for calling variants in diploid systems. However, it ca > ### Running `FreeBayes` > ->![](../images/mt_freebayes_genome.png) +>![](../../images/mt_freebayes_genome.png) > >Set genome to `hg38` (the latest version) > ->![](../images/mt_freebayes_regions.png) +>![](../../images/mt_freebayes_regions.png) > >Set regions to `chrM` from `1` to `16000`. This will simply save us time since we are only interested in mitochondrial variants anyway > ->![](../images/mt_freebayes_alloptions.png) +>![](../../images/mt_freebayes_alloptions.png) > >Choose `Complete list of all samples` from **Choose parameter selection level** drop down. > ->![](../images/mt_freebayes_popmodel.png) +>![](../../images/mt_freebayes_popmodel.png) > >This is one of the most important parameter choices one needs to make when calling variants in non-diploid systems. 
Here set **Set population model** to `Yes` and then: > @@ -213,7 +213,7 @@ FreeBayes is widely used for calling variants in diploid systems. However, it ca >* Set **Assume that samples result from pooled sequencing** to `Yes` >* Set **Output all alleles which pass input filters, regardless of genotyping outcome or model** to `Yes` > ->![](../images/mt_freebayes_allelic_scope.png) +>![](../../images/mt_freebayes_allelic_scope.png) > >We will also set **Allelic scope** to `Yes` and restrict variant types to single nucleotide polymorphisms only by: > @@ -222,7 +222,7 @@ FreeBayes is widely used for calling variants in diploid systems. However, it ca > >Mitochondria has a number of low complexity regions (mononucleotide repeats). Setting these parameters as described above will decrease noise from these regions. > ->![](../images/mt_freebayes_inputfilters.png) +>![](../../images/mt_freebayes_inputfilters.png) > >Finally, let's set **Input filters** to `Yes` and set: > @@ -330,8 +330,8 @@ Even though we selected somewhat stringent input parameters (restricting base qu | | |----------------------------| -|![](../images/mt_biases.png)| -|Here you can see that in an ideal case (indicated with a green star) a variant is evenly represent by different areas of sequencing reads (cycle and placement biases) and is balanced across the two strands (strand bias). Allele imbalance is not applicable in our case as it reflects significant deviation from the diploid (50/50) expectation (see [here](../images/freebayes.pdf) for more details).| +|![](../../images/mt_biases.png)| +|Here you can see that in an ideal case (indicated with a green star) a variant is evenly represent by different areas of sequencing reads (cycle and placement biases) and is balanced across the two strands (strand bias). 
Allele imbalance is not applicable in our case as it reflects significant deviation from the diploid (50/50) expectation (see [here](../../images/freebayes.pdf) for more details).| A robust tool set for processing VCF data is provided by [vcflib](https://github.com/vcflib/vcflib) developed by Erik Garrison, the author of FreeBayes. One way to filter VCF is using `INFO` fields of the VCF dataset. If you look at the VCF dataset shown above you will see all comment lines beginning with `##INFO`. These are `INFO` fields. Each VCF record contains a list of `INFO` tags describing a wide range of properties for each VCF record. You will see that FreeBayes and NVC differ significantly in the number and types of `INFO` fields each of these caller generates. This why the two require different filtering strategies. @@ -346,7 +346,7 @@ Among numerous types of data generated by FreeBayes let's consider the following To perform filtering we will use **NGS: VCF Manipulation** → **VCFfilter**): > ### Filtering VCF data > ->![](../images/mt_vcffilter.png) +>![](../../images/mt_vcffilter.png) > >Filtering FreeBayes VCF for strand bias (`SPR` and `SAP`), placement bias (`EPP`), variant quality (`QUAL`), and depth of coverage (`DP`). {: .hands_on} @@ -371,16 +371,16 @@ VCF.IOBIO can be invoked by expanding a VCF dataset in Galaxy's history by click > ### Displaying data in VCF.IOBIO > ->![](../images/mt_vcf_dataset_collapsed.png) +>![](../../images/mt_vcf_dataset_collapsed.png) > >Clicking on the dataset above will expand it as shown below: > ->![](../images/mt_vcf_dataset_expanded.png) +>![](../../images/mt_vcf_dataset_expanded.png) > >At the bottom there is a link "display at vcf.iobio" >Clicking on this link will start indexing of VCF datasets, which is required to display them. After indexing VCF.IOBIO will open: > ->![](../images/mt_vcfiobio.png) +>![](../../images/mt_vcfiobio.png) > >Of course there are not that many variants to look at in this example. 
Nevertheless there are helpful statistics such as the Transition/Transversion (Ts/Tv) ratio.
(2015) explains the overall idea: -![](../images/everything_connected.png) +![](../../images/everything_connected.png) There is a variety of ways in which RNA is treated during its conversion to cDNA and eventual preparation of sequencing libraries. In general the experimental workflow includes the following steps: @@ -38,7 +38,7 @@ In listing these basic steps we are ignoring a vast amount of details such as, f Reverse Transcriptase (RT) requires a primer. One can leverage the fact that the majority of processed mRNAs are polyadenylated and use oligo-dT primer to (mostly) restrict cDNA synthesis to fully processed mRNAs. Alternatively one can use a mix of random oligonucleotides to prime RT at a multitude of internal sites irrespective of RNA type and maturation status: ->![](../images/dT_random.png) +>![](../../images/dT_random.png) > >**Oligo-dT vs. random priming**
    >Oligo-dT (**A**) and random priming (**B**) @@ -49,21 +49,21 @@ Depending on the choice of the approach one would have different types of RNAs i RNAs that are typically targeted in RNAseq experiments are single stranded (e.g., mRNAs) and thus have polarity (5' and 3' ends that are functionally distinct): ->![](../images/dna_rna.png) +>![](../../images/dna_rna.png) > >**Relationship between DNA and RNA orientation** During a typical RNAseq experiment the information about strandedness is lost after both strands of cDNA are synthesized, size selected, and converted into sequencing library. However, this information can be quite useful for various aspects of RNAseq analysis such as transcript reconstruction and quantification. There is a number of methods for creating so called *stranded* RNAseq libraries that preserve the strand information (for an excellent overview see Levin et al. [2010](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3005310/)): ->![](../images/stranded_protocols.png) +>![](../../images/stranded_protocols.png) > >**Generation of stranded RNAseq libraries**
    >Different types of stranded library generation protocols from [Levin:2010](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3005310/) Depending on the approach and whether one performs single- or paired-end sequencing there are multiple possibilities on how to interpret the results of mapping of these reads onto genome/transcriptome: ->[![](../images/lib_type.png)](http://sailfish.readthedocs.org/en/master/library_type.html) +>[![](../../images/lib_type.png)](http://sailfish.readthedocs.org/en/master/library_type.html) > >**Effects of RNAseq library types**
    >Image and description below is from [Sailfish documentation](http://sailfish.readthedocs.org/en/master/library_type.html) @@ -94,7 +94,7 @@ However, in practice, if you use Illumina paired-end RNAseq protocols you are un The implication of stranded RNAseq is that you can distinguish whether the reads are derived from forward- or reverse-encoded transcripts: ->![](../images/stranded_result.png) +>![](../../images/stranded_result.png) > >**Stranded RNAseq data look like this**
    >This example contrasts unstranded and stranded RNAseq experiments. Red transcripts are from + strand and blue are from - strand. In stranded example reads are clearly stratified between the two strands. A small number of reads from opposite strand may represent anti-sense transcription. The image from GATC Biotech. @@ -123,13 +123,13 @@ After sequencing is performed you have a collection of sequencing reads for each [Tophat](http://bioinformatics.oxfordjournals.org/content/25/9/1105.abstract) was one of the first tools designed specifically to address this problem by identifying potential exons using reads that do map to the genome, generating possible splices between neighboring exons, and comparing reads that did not initially map to the genome agaisnt these *in silico* created junctions: ->[![](../images/tophat.png)](http://bioinformatics.oxfordjournals.org/content/25/9/1105/F1.expansion.html) +>[![](../../images/tophat.png)](http://bioinformatics.oxfordjournals.org/content/25/9/1105/F1.expansion.html) > >**TopHat and TopHat2: Mapping RNAseq regions to genome**
    >In TopHat reads are mapped against the genome and are separated into two categories: (1) those that map, and (2) those that initially unmapped (IUM). "Piles" of reads representing potential exons are extended in search of potential donor/acceptor splice sites and potential splice junctions are reconstructed. IUMs are then mapped to these junctions. Image from [Trapnell:2009](http://bioinformatics.oxfordjournals.org/content/25/9/1105.full). ->[![](../images/tophat2.png)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) +>[![](../../images/tophat2.png)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) > >**TopHat has been subsequently improved with the development of TopHat2**
    >Image from [Kim:2012](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) summarizes steps involved in aligning of RNAseq reads with TopHat2 @@ -137,7 +137,7 @@ After sequencing is performed you have a collection of sequencing reads for each To further optimize and speed up spliced read alignment Kim at al. [2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) developed [HISAT](http://ccb.jhu.edu/software/hisat2/index.shtml). It uses a set of [FM-indices](https://en.wikipedia.org/wiki/FM-index) consisting one global genome-wide index and a collection of ~48,000 local overlapping 42 kb indices (~55,000 56 kb indices in HiSat2). This allows to find initial seed locations for potential read alignments in the genome using global index and to rapidly refine these alignments using a corresponding local index: ->![](../images/hisat.png) +>![](../../images/hisat.png) > >**Hierarchical Graph FM index in HiSat/HiSat2**
    >A part of the read (blue arrow) is first mapped to the genome using the global FM index. The HiSat then tries to extend the alignment directly utilizing the genome sequence (violet arrow). In (**a**) it succeeds and this read aligned as it completely resides within an exon. In (**b**) the extension hits a mismatch. Now HiSat takes advantage of the local FM index overlapping this location to find the appropriate matting for the remainder of this read (green arrow). The (**c**) shows a combination these two strategies: the beginning of the read is mapped using global FM index (blue arrow), extended until it reaches the end of the exon (violet arrow), mapped using local FM index (green arrow) and extended again (violet arrow). Image from [Kim:2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) @@ -146,7 +146,7 @@ To further optimize and speed up spliced read alignment Kim at al. [2015](https: [STAR aligner](https://github.com/alexdobin/STAR) is a fast alternative for mapping RNAseq reads against genome utilizing uncompressed [suffix array](https://en.wikipedia.org/wiki/Suffix_array). It operates in [two stages](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract). In the first stage it performs seed search: ->![](../images/star.png) +>![](../../images/star.png) > >**STAR's seed search**
    >Here a read is split between two consecutive exons. STAR starts to look for a *maximum mappable prefix* (MMP) from the beginning of the read until it can no longer match continuously. After this point it start to MMP for the unmatched portion of the read (**a**). In the case of mismatches (**b**) and unalignable regions (**c**) MMPs serve as anchors from which to extend alignments. Image from [Dobin:2013](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html). @@ -157,7 +157,7 @@ At the second stage STAR stitches MMPs to generate read-level alignments that (c The previous step - mapping - assigns RNAseq reads to genomic locations and identifies splice junctions from reads that originate from different exons. At transcript reconstruction step this information is taken further in attempt to build transcript models. There is a number of tools for performing this task. A benchmarking paper by [Hayer:2015](http://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html) attempted to compare performance of existing approaches with one of the outcomes shown below: ->[![](../images/rnaseq_comparison.png)](http://bioinformatics.oxfordjournals.org/content/early/2015/09/08/bioinformatics.btv488/F5.large.jpg) +>[![](../../images/rnaseq_comparison.png)](http://bioinformatics.oxfordjournals.org/content/early/2015/09/08/bioinformatics.btv488/F5.large.jpg) > >**Comparison of transcript reconsruction approaches**
    >Here *recall* (the number of correctly constructed forms divided by the total number of real forms) versus *precision* (true positives divided by the sum of true positives and false positives) are plotted for seven transcript assemblers tested on two simulated datasets: *EnsemblPerfect* and *EnsemblRealistic*. The shaded region is indicating suboptimal performance (i.e., the white, unshaded region is "good"). The figure is from [Hayer:2015](http://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html). @@ -168,7 +168,7 @@ Based on these results [Cufflinks](http://cole-trapnell-lab.github.io/cufflinks/ [StringTie](https://ccb.jhu.edu/software/stringtie/) assembles transcripts from spliced read alignemnts produced by tools such as STAR, TopHat, or HISAT and simultaneously estimates their abundances using counts of reads assigned to each transcript. The following images illustrates details of StringTie workflow: ->![](../images/stringtie1.png) +>![](../../images/stringtie1.png) > >**StringTie workflow**
Image from [Pertea:2015](https://www.nature.com/articles/nbt.3122) (*StringTie enables improved reconstruction of a transcriptome from RNA-seq reads*)
    >Sailfish indexes input transcriptome for a fixed *k*-mer length and compares *k*-mers derived from RNAseq reads against this index. Image from [Patro:2014](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4077321/) The current version of Sailfish uses [quasi-alignment](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) to extend exact matches found with *k*-mers: ->![](../images/quasi_aln.png) +>![](../../images/quasi_aln.png) > >**Quasi-alignment of reads in Sailfish**
    >In Sailfish version [0.7.0](https://github.com/kingsfordgroup/sailfish/releases/tag/v0.7.0) and up transcriptome is concatenated into a single sequence using `$` separators from which a [suffix array](https://en.wikipedia.org/wiki/Suffix_array) and a [hash table](https://en.wikipedia.org/wiki/Hash_table) are constructed. A *k*-mer from an RNAseq read (green) is looked up in the hash table, which immediately gives its position in the suffix array allowing to extend the march as described in the legend and the [paper](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf). Image from [Srivastava:2015](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) [Kallisto](http://pachterlab.github.io/kallisto/) also utilizes *k*-mer matching but uses a different data structure. It constructs a [De Bruijn graph](https://en.wikipedia.org/wiki/De_Bruijn_graph) from transcriptome input (pane **b** of the figure below). This graph is different from De Bruijn graphs used for genome assembly in that its nodes are *k*-mers and transcripts correspond to paths through the graph. To accommodate multiple transcripts that can lay along the same path (or sub-path) the paths are "colored" with each transcript given a distinct "color" (in genome assembly the graph is built from the reads and nodes usually correspond to overlaps between *k*-mers forming incoming and outgoing edges). Non-branching sections of the graph that have identical coloring are "glued" into contigs. Finally a [hash table](https://en.wikipedia.org/wiki/Hash_table) is built that stores the position of each transcriptome *k*-mer within the graph: ->![](../images/kallisto.png) +>![](../../images/kallisto.png) > >**Assigning reads to transcripts: Kallisto**
    >Here a black read is being associated with a set consisting of red, blue, and green transcripts (**a**). First, a graph is built from transcriptome (**b**). Next, by finding common *k*-mers between the read and the graph the read is "threaded" along a path (**c** and **d**). The colors along that path would indicate which transcripts it is likely derived from. Specifically, this is done by taking intersection of "colors" (**c**). It this case the read is assigned to two transcripts: red and blue. Image from [Bray:2015](http://arxiv.org/pdf/1505.02710v2.pdf) @@ -209,7 +209,7 @@ The current version of Sailfish uses [quasi-alignment](http://biorxiv.org/conten [Salmon](https://combine-lab.github.io/salmon/about/) does not use *k*-mer matching approach. Instead it creates [bwa](https://github.com/lh3/bwa)-like [FM-index](https://en.wikipedia.org/wiki/FM-index) and uses it to finds chains of *Maximal Exact Matches* (MEMs) and *Super Maximal Exact Matches* (SMEMs) between a read and the transcriptome. [Patro:2015](http://biorxiv.org/content/biorxiv/early/2015/06/27/021592.full.pdf) define a MEM as "*a substring that is shared by the query (read) and reference (transcript) that cannot be extended in either direction without introducing a mismatch*". Similraly, a SMEM is defined as a "*MEM that is not contained within any other MEM on the query.*" One of the advantages of utilizing the FM-index is that a new index does not need to re-generated for a search with different set of parameters. In the case of Sailfish and Kallisto an index is dependent on *k*-mer length and has to be recomputed every time the *k* is changed. The overall schematics of Salmon operation is as follows: ->![](../images/salmon.png) +>![](../../images/salmon.png) > >**Assigning reads to transcripts: Salmon**
    >Image from [Patro:2015](http://biorxiv.org/content/biorxiv/early/2015/06/27/021592.full.pdf) @@ -222,7 +222,7 @@ Once reads are apportioned across individual transcripts they can be quantified. StringTie, which performs assembly and quantification simultaneously converts splice graph into a flow network for which it solves [the maximum flow problem](https://en.wikipedia.org/wiki/Maximum_flow_problem). The maximum flow is such network represents the expression level for a given transcript: ->![](../images/stringtie2.png) +>![](../../images/stringtie2.png) > >**StringTie flow network**
    >Here each exon node from the splice graph is split into *in* and *out* nodes connected with an edge weighted by the number of reads corresponding to that exon. For example, the first exon is covered by seven reads and so the edge between 1-in and 1-out has a weight of 7. Expression level would correspond to the maximum flow through a path representing a given transcript. Image from [Pertea:2015](StringTie enables improved reconstruction of a transcriptome from RNA-seq reads) @@ -245,7 +245,7 @@ The Expectation/Maximization framework (EM) is utilized in a number of tools suc During next expectation stage read are re-apportioned across transcripts and the procedure is repeated until convergence: ->![](../images/em.png) +>![](../../images/em.png) > >**Expectation Maximization (EM)**
>Image from [Pachter:2011](http://arxiv.org/pdf/1104.3889v2.pdf)
    >Table from Dündar et al. [2015](http://chagall.med.cornell.edu/RNASEQcourse/) @@ -277,7 +277,7 @@ The goal of differential expression analysis (DE) is to find gene (DGE) or trans For this expression is estimated from read counts and attempts are made to correct for variability in measurements using replicates that are absolutely essential accurate results (see below). We begin our short discussion on DE by reproducing a figure from [Trapnell:2013](http://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) highlighting some of the challenges associated with judging expression differences from read counts: ->![](../images/diff.png) +>![](../../images/diff.png) > >**Differential expression: Read counts and Expression levels**
    >**Change in fragment count for a gene does not necessarily equal a change in expression**. (**a**) Simple read-counting schemes sum the fragments incident on a gene’s exons. The exon-union model counts reads falling on any of a gene’s exons, whereas the exon-intersection model counts only reads on constitutive exons. (**b**) Both of the exon-union and exon intersection counting schemes may incorrectly estimate a change in expression in genes with multiple isoforms. The true expression is estimated by the sum of the length-normalized isoform read counts. The discrepancy between a change in the union or intersection count and a change in gene expression is driven by a change in the abundance of the isoforms with respect to one another. In the top row, the gene generates the same number of reads in conditions A and B, but in condition B, all of the reads come from the shorter of the two isoforms, and thus the true expression for the gene is higher in condition B. The intersection count scheme underestimates the true change in gene expression, and the union scheme fails to detect the change entirely. In the middle row, the intersection count fails to detect a change driven by a shift in the dominant isoform for the gene. The union scheme detects a shift in the wrong direction. In the bottom row, the gene’s expression is constant, but the isoforms undergo a complete switch between conditions A and B. Both simplified counting schemes register a change in count that does not reflect a change in gene expression. Figure from [Trapnell:2013] (http://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) @@ -311,11 +311,11 @@ Here is what to do to load the data: >Go to the [data library](https://usegalaxy.org/library/list#folders/Ff4ce53393dae30ee) and select all fastq files. 
Then Click `to History` button: > ->![](../images/rnaseq_library.png) +>![](../../images/rnaseq_library.png) > >The datasets will appear in your history: > ->![](../images/rnaseq_data_in_history.png) +>![](../../images/rnaseq_data_in_history.png) > >Twelve datasets make a lot of clicking necessary. To avoid this annoyance we will combine them into two collections - **c1** and **c2** as shown in the video below. Also, see this [tutorial](collections.html) for yet another explanation of dataset collections. >
    @@ -334,29 +334,29 @@ We will map the reads with TopHat2. Select **TopHat** from **NGS: RNA Analysis** >* **TopHat settings to use** = `Full parameter list` This is done to be able to specify the strandedness of the library. >* **Library Type** = `FR First Strand` > ->![](../images/tophat_interface.png) +>![](../../images/tophat_interface.png) > >The same procedure is then repeated for collection **c2**. In the end it generates a lot of datasets in the history resulting in something resembling an image below. TopHat produces five types of output and because we started with dataset collections every one of the green boxes shown below is actually a collection of outputs for **c1** and **c2**, respectively. > ->![](../images/tophat_output.png) +>![](../../images/tophat_output.png) Let's now take a look at some of the alignments. We will use IGV for this purpose. >First, let's drill down to actual alignments produced by TopHat. For example, in figure shown above simply click on **TopHat on collection 14: accepted_hits** and you will see a list of datasets corresponding to alignments of reads derived from each conditions: > ->![](../images/accepted_hits_1.png) +>![](../../images/accepted_hits_1.png) > >Now, click on **c2-r1x** and the following will appear: > ->![](../images/accepted_hits_2.png) +>![](../../images/accepted_hits_2.png) > >Finally, use **D. melanogaster** link (highlighted above) and follow the on-screen instructions. 
By focusing IGV on genomic position `chrX:11,897,111-11,920,446` you will be able to see spliced alignments produced by TopHat: > ->![](../images/igv_tophat.png) +>![](../../images/igv_tophat.png) > >and [sashimi plots](http://software.broadinstitute.org/software/igv/Sashimi) highlighting potential splice junctions: > ->![](../images/sashimi.png) +>![](../../images/sashimi.png) ### Performing differential expression analysis @@ -366,7 +366,7 @@ Using mapped reads produced by TopHat we will perform analysis of differential g [`HTSeq-count`](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html) is one of the most popular tools for gene quantification. `HTseq-count` gives you multiple choices on how to handle read mapping to multiple locations, reads overlapping introns, or reads that overlap more than one genomic feature: ->[![](../images/htseq_count.png)](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html) +>[![](../../images/htseq_count.png)](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html) > >**`HTseq-count` read/feature overlap modes**
    >The `htseq-count` script of the HTSeq suite offers three different modes to handle details of read–feature overlaps that are depicted here. The default of featureCounts is the behavior of the union option. Image is from [HTseq documentation](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html); Caption by [Dündar:2015](http://chagall.med.cornell.edu/RNASEQcourse/) @@ -376,7 +376,7 @@ Before we can use `HTseq-count` we need to download gene annotations for version #### Getting *Drosophila malanogaster* gene annotation from UCSC >Select **UCSC Main** from **Get Data** section of the menu. Within the UCSC Genome Browser interface set parameters as shown below. In particular make sure that **assembly** is set ti `dm3` and **output format** is set to `GTF`. Click **get output**. ->[![](../images/ucsc_dm3.png)](../images/ucsc_dm3.png) +>[![](../../images/ucsc_dm3.png)](../../images/ucsc_dm3.png) > >This [GTF](http://www.ensembl.org/info/website/upload/gff.html) dataset will be used one of the input for HTseq-count. @@ -386,10 +386,10 @@ Before we can use `HTseq-count` we need to download gene annotations for version >`htseq-count` needs strand information to proceed. The strand information is specified as `+`, `-`, or `.` (unknown) in a GTF dataset. `htseq-count` does not like `.` and will generate an error if such unstranded features appear in data. To prevent these errors from happening we will filter them out from GTF file using **Filter** tool from **Filter and Sort** section of tool menu. Here `c7 != "."` means that we need to filter all rows where strand column (column #7) contains a dot: > ->![](../images/filter_gtf.png) +>![](../../images/filter_gtf.png) > >Select **htseq-count** from **NGS: RNA analysis** section on the left side of the menu. Set parameters as shown below. The red arrow points that to enable `htseq-count` to see collections, you need to select the 'folder' button. 
In the case of this particular Galaxy [history](https://usegalaxy.org/u/aun1/h/rna-seq-tutorial) we will need to run `htseq-count` twice. Once on TopHat alignemnts for collection **c1** (dataset #37; shown below) and then on alignments for collection **c2** (dataset # 57).| -|![](../images/htseq_count_interface.png) +|![](../../images/htseq_count_interface.png) > >This will generate [read counts per gene](https://usegalaxy.org/datasets/bbd44e69cb8906b5d1e80eae6d363142/display/?preview=True). @@ -407,24 +407,24 @@ Before we can use `HTseq-count` we need to download gene annotations for version >The `DESeq2` Galaxy's interface is shown below. `DESeq2` allows to incorporate multiple *factors* in the analysis. In our case we only have one factor, which we call **Conditions**. This is because we are trying to find genes that are differentially expressed between two conditions. The first condition will the first **factor level**, while condition 2 will be the second **factor level**. Here the input for this first factor level is set to a collection `84: htseq-count on collection 37` and the input for the second input is set to `92: htseq-count on collection 57`. Make sure that **Visualising the analysis results** is set to `Yes`: > ->![](../images/deseq2_interface.png) +>![](../../images/deseq2_interface.png) > >This will produce [output](https://usegalaxy.org/datasets/bbd44e69cb8906b5d648fe21c36ac662/display/?preview=True) as shown below. 
The columns are: (**1**) gene identifier, (**2**) mean normalised counts, averaged over all samples from both conditions, (**3**) logarithm (base 2) of the fold change, (**4**) the standard error estimate for the log2 fold change estimate, (**5**) [Wald test](https://en.wikipedia.org/wiki/Wald_test) statistic, (**6**) p value for the statistical significance of this change, and (**7**) *p*-value adjusted for multiple testing with the Benjamini-Hochberg procedure which controls false discovery rate ([FDR](https://en.wikipedia.org/wiki/False_discovery_rate)). There is only one gene with significant change in gene expression between conditions: `CG1803-RC` with *p*-value = 1.6x10-05 > ->![](../images/deseq2_output.png) +>![](../../images/deseq2_output.png) > >In addition to the [list of genes](https://usegalaxy.org/datasets/bbd44e69cb8906b5d648fe21c36ac662/display/?preview=True) DESeq2 outputs a graphical summary of the result. It includes a number of plots that should be used to evaluate the quality of the experiment. The histogram of *p*-values below shows that in our sample there is in fact just one instance of a significant *p*-value: > ->![](../images/p_val_hist.png) +>![](../../images/p_val_hist.png) > >The [MA plot](https://en.wikipedia.org/wiki/MA_plot) below shows the relationship between the expression change (M) and average expression strength (A). Genes with adjusted *p*-value < 0.1 are in red (there is only one such gene in thi sample at the bottom of the graph): > ->![](../images/MA_plot.png) +>![](../../images/MA_plot.png) > >The Principal Component Analysis ([PCA](https://en.wikipedia.org/wiki/Principal_component_analysis)) shows the separation between Condition 1 and 2. 
This type of plot is useful for visualizing the overall effect of experimental covariates and batch effects (each replicate is plotted as an individual data point): > ->![](../images/pca.png) +>![](../../images/pca.png) > >A heatmap of sample-to-sample distance matrix gives us an overview over similarities and dissimilarities between samples: > ->![](../images/euc_dist.png) +>![](../../images/euc_dist.png) From 09a49fa1afd699a75e2c43c21ef436329775f97c Mon Sep 17 00:00:00 2001 From: shiltemann Date: Fri, 7 Jul 2017 17:48:43 +0200 Subject: [PATCH 29/35] fix image links in usegalaxy tutorials --- topics/usegalaxy/tutorials/history/tutorial.md | 42 +++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/topics/usegalaxy/tutorials/history/tutorial.md b/topics/usegalaxy/tutorials/history/tutorial.md index 00ba0e35..c71b50d8 100644 --- a/topics/usegalaxy/tutorials/history/tutorial.md +++ b/topics/usegalaxy/tutorials/history/tutorial.md @@ -31,7 +31,7 @@ clear you sessions - that history will be lost!** We can not recover it for you ### Current history controls -![](../images/current-history-buttons.png) +![](../../images/current-history-buttons.png) Above the current history panel are three buttons: the refresh, history options, and 'view all histories' button. @@ -42,7 +42,7 @@ The history options button opens the history options menu which allows you to pe The 'view all histories' button sends you to the interface for -[managing multiple histories](../images/index.md#managing_multiple_histories). +[managing multiple histories](../../images/index.md#managing_multiple_histories). ## History Information @@ -58,7 +58,7 @@ All histories begin with the name 'Unnamed history'. Non-anonymous users can ren 3. Press 'Enter' to save the new name. The input field will disappear and the new name display. 4. To cancel renaming, press 'Esc' or click outside the input field. 
-![](../images/renaming.png) +![](../../images/renaming.png) ### Tagging a history @@ -76,7 +76,7 @@ To tag a history: 3. Press enter or select one of the previous tags with your arrow keys or mouse. 4. To remove an existing tag, click the small 'X' on the tag or use the backspace key while in the input field. -![](../images/tags.png) +![](../../images/tags.png) ### Annotating a history @@ -94,7 +94,7 @@ To annotate a history: entered since the 'Tab' button is used to switch between controls on the page - tabs can be pasted in however). 4. To save the annotation, click the 'Done' button. -![](../images/annotations.png) +![](../../images/annotations.png) ### History size @@ -122,7 +122,7 @@ There are several different 'states' a dataset can be in: 1. If a previously running or queued job has been paused by Galaxy, the dataset will be in the **paused** state. You can re-start/resume paused jobs using the options menu above the history panel and selecting 'Resume Paused Jobs'. -![](../images/states.png) +![](../../images/states.png) Datasets in the panel are initially shown in a 'summary' view, that only displays: @@ -137,7 +137,7 @@ Datasets in the panel are initially shown in a 'summary' view, that only display action. For example, the 'edit' button is disabled for datasets that are still queued or running. {: .alert .alert-warning} -![](../images/summary.png) +![](../../images/summary.png) You can click the dataset name and the view will expand to show more details: @@ -148,7 +148,7 @@ You can click the dataset name and the view will expand to show more details: 1. a row of buttons that allow further actions on the dataset 1. a **peek** of the data: a couple of rows of data with the column headers (if available) -![](../images/details.png) +![](../../images/details.png) **Note:** many of these details are only displayed if the dataset has finished running, is in the 'ok' state, and @@ -167,7 +167,7 @@ history has hidden datasets, the number will appear there (e.g. 
'3 hidden') as a the hidden datasets are shown. Each hidden dataset has a link in the top of the summary view that allows you to unhide it. You can click that link again (which will now be 'hide hidden') to make them not shown again. -![](../images/hide.png) +![](../../images/hide.png) ### Deleting and undeleting datasets @@ -178,7 +178,7 @@ link. Clicking this link (e.g. '3 deleted') will make the deleted datasets visib link for manually undeleting it above its title. You can click that link again (which will now be 'hide deleted') to make them not shown again. -![](../images/delete.png) +![](../../images/delete.png) ### Purging datasets and removing them permanently from Galaxy @@ -211,7 +211,7 @@ You can also hide, delete, and purge multiple datasets at once by **multi-select an action doesn't apply to a selected dataset - like deleting a deleted dataset - nothing will happen.) 1. You can click the multiselect button again to hide the checkboxes again. -![](../images/multiselect.png) +![](../../images/multiselect.png) ### Searching for datasets @@ -234,7 +234,7 @@ For example: **Note:** searches are case-insensitive. For example, `VCF` and `vcf` are equivalent. {: .alert .alert-warning} -![](../images/basic-search.png) +![](../../images/basic-search.png) ### Clearing a search @@ -263,7 +263,7 @@ You can enclose text and include spaces using double quotes: `name="My Dataset" If you find normal searching is showing too many datasets, and not what you're looking for, try the advanced keyword search. -![](../images/adv-search.png) +![](../../images/adv-search.png) ### Search and multiselect @@ -293,22 +293,22 @@ history method is presented here: Click the multi-history icon at the top right of the 'Analyze Data' (home) page. Note: you must be logged in to see the icon and use the multi-history page. You should see all the (non-deleted) histories that you've created. 
-![](../images/undelete.multihistory-button.png) +![](../../images/undelete.multihistory-button.png) Click the '...' icon button in the grey header at the top of the page. You should see a dialog that presents some options for viewing the histories. Click the 'include deleted histories' option. -![](../images/undelete.multihistory-options.png) +![](../../images/undelete.multihistory-options.png) The page should reload and now both non-deleted and deleted histories will be displayed. Deleted histories will have a small message under their titles stating 'This history has been deleted'. -![](../images/undelete.thishasbeendeleted.png) +![](../../images/undelete.thishasbeendeleted.png) Now click the small button with the down arrow just above the deleted history you want to undelete. Then click the 'Undelete' option there. Your history should now be undeleted. -![](../images/undelete.undelete-button.png) +![](../../images/undelete.undelete-button.png) Click the 'Switch to' button at the top of that history and then click 'done' at the very top left to return to the 'Analyze Data' page. -![](../images/undelete.switchto.png) +![](../../images/undelete.switchto.png) ## Dataset Collections @@ -342,7 +342,7 @@ the forward reads and one file contains the reverse reads. Many bioinformatic to further simplify this by placing both files into on 'Dataset Pair' collection. Only two files will be added to the collection: forward and reverse. -![](../images/pair.png) +![](../../images/pair.png) ### Dataset list @@ -350,7 +350,7 @@ Choose 'Dataset List' when you have a set of files that are of the same type and analysis. The datasets in a dataset list must have unique names (e.g. you cannot have two datasets in a dataset list with the name '1.bed'). -![](../images/list.png) +![](../../images/list.png) ### List of dataset pairs @@ -359,4 +359,4 @@ to create this is currently the most flexible and potentially most complicated. datasets sent to the interface based on the dataset names. 
You are free to select your own pairs, however, and change the order of the collection. Click the help text at the top of the interface to see more information. -![](../images/list-pairs.png) +![](../../images/list-pairs.png) From 21dacb87745a5212965d5172cb9de6a046777317 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 19:28:46 +0200 Subject: [PATCH 30/35] use https for bioteam --- topics/introduction/tutorials/options-for-using-galaxy/slides.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/introduction/tutorials/options-for-using-galaxy/slides.html b/topics/introduction/tutorials/options-for-using-galaxy/slides.html index 80795ff7..7138f772 100644 --- a/topics/introduction/tutorials/options-for-using-galaxy/slides.html +++ b/topics/introduction/tutorials/options-for-using-galaxy/slides.html @@ -139,7 +139,7 @@ ### Commercial support -- You can buy a [preconfigured Galaxy server](http://bioteam.net/products/galaxy-appliance/) from [BioTeam](http://bioteam.net/) +- You can buy a [preconfigured Galaxy server](https://bioteam.net/products/galaxy-appliance/) from [BioTeam](https://bioteam.net/) - [Globus Genomics](http://globusgenomics.org/) and [GenomeCloud](http://www.genome-cloud.com/) provide cloud-based Galaxy servers - [Commercially provided consulting and training](https://galaxyproject.org/support/commercial/) are also available From ee8cac9bb7c061f5e9e1ef54021528b4f0e2eb6e Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 20:32:49 +0200 Subject: [PATCH 31/35] use https where possible --- CONTRIBUTING.md | 14 ++--- LICENSE.md | 2 +- README.md | 6 +-- _layouts/home.html | 2 +- shared/literature.md | 18 +++---- topics/admin/slides/index.html | 2 +- topics/admin/tutorials/database-schema/tutorial.md | 6 +-- .../admin/tutorials/dev-to-production/tutorial.md | 2 +- topics/admin/tutorials/galaxy-docker/slides.html | 2 +- .../tutorials/debruijn-graph-assembly/tutorial.md | 2 +- 
.../tutorials/general-introduction/tutorial.md | 2 +- topics/chip-seq/slides/index.html | 2 +- topics/chip-seq/tutorials/chip-seq/tutorial.md | 44 +++++++-------- .../tal1-binding-site-identification/tutorial.md | 14 ++--- .../seqtk/tool_test_output.html | 8 +-- topics/dev/tutorials/architecture/slides.html | 18 +++---- topics/dev/tutorials/bioblend-api/slides.html | 8 +-- topics/dev/tutorials/conda/slides.html | 16 +++--- topics/dev/tutorials/containers/slides.html | 2 +- .../tutorials/data-source-integration/tutorial.md | 2 +- .../tutorials/interactive-environments/slides.html | 4 +- topics/dev/tutorials/interactive-tours/slides.html | 4 +- topics/dev/tutorials/tool-integration/slides.html | 8 +-- topics/dev/tutorials/toolshed/slides.html | 4 +- .../dev/tutorials/visualization-charts/tutorial.md | 18 +++---- topics/dev/tutorials/webhooks/slides.html | 2 +- topics/dev/tutorials/webhooks/tutorial.md | 8 +-- .../tutorials/methylation-seq/tutorial.md | 2 +- topics/introduction/slides/index.html | 2 +- .../tutorials/galaxy-intro-collections/tutorial.md | 2 +- .../tutorials/galaxy-intro-peaks2genes/tutorial.md | 4 +- .../tutorials/galaxy-intro-vis/tutorial.md | 2 +- .../tutorials/igv-introduction/tutorial.md | 24 ++++----- .../tutorials/options-for-using-galaxy/slides.html | 10 ++-- .../processing-many-samples-at-once/tutorial.md | 6 +-- topics/metagenomics/slides/index.html | 2 +- .../tutorials/general-tutorial/tutorial.md | 6 +-- .../tutorials/mothur-miseq-sop/tutorial.md | 14 ++--- .../tutorials/database-handling/tutorial.md | 2 +- topics/proteomics/tutorials/ntails/tutorial.md | 4 +- .../tutorials/protein-id-sg-ps/tutorial.md | 2 +- .../tutorials/secretome-prediction/tutorial.md | 6 +-- .../tutorials/de-novo-rad-seq/tutorial.md | 10 ++-- .../tutorials/genetic-map-rad-seq/tutorial.md | 8 +-- .../tutorials/genome-annotation/tutorial.md | 6 +-- .../tutorials/mapping/tutorial.md | 2 +- .../tutorials/quality-control/tutorial.md | 4 +- 
.../tutorials/ref-based-rad-seq/tutorial.md | 10 ++-- .../create-new-tutorial-docker/tutorial.md | 2 +- .../create-new-tutorial-tours/tutorial.md | 4 +- .../training/tutorials/good-practices/slides.html | 2 +- topics/transcriptomics/README.md | 10 ++-- topics/transcriptomics/slides/index.html | 10 ++-- .../transcriptomics/tutorials/de-novo/tutorial.md | 6 +-- .../tutorials/ref-based/tutorial.md | 32 +++++------ topics/transcriptomics/tutorials/srna/tutorial.md | 2 +- topics/usegalaxy/README.md | 6 +-- topics/usegalaxy/docker/README.md | 2 +- topics/usegalaxy/tutorials/collections/tutorial.md | 2 +- topics/usegalaxy/tutorials/dip/tutorial.md | 28 +++++----- topics/usegalaxy/tutorials/dunovo/tutorial.md | 12 ++--- topics/usegalaxy/tutorials/ngs/tutorial.md | 34 ++++++------ topics/usegalaxy/tutorials/non-dip/tutorial.md | 26 ++++----- topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md | 62 +++++++++++----------- .../tutorials/diploid-variant-calling/tutorial.md | 24 ++++----- .../tutorials/exome-seq/tutorial.md | 4 +- 66 files changed, 308 insertions(+), 308 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e4e7d081..e1128251 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,7 +49,7 @@ Once you are feeling more comfortable, you can propose changes to this training In [issues](https://github.com/galaxyproject/training-material/issues) and [project management system](https://github.com/galaxyproject/training-material/projects), you will find lists of issues to fix and features to change (with the "newcomer-friendly" label for example). Feel free to solve them. -We strongly recommend you read and follow Software Carpentry's recommendations on [lesson design](https://swcarpentry.github.io/lesson-example/01-design/) and [lesson writing](http://swcarpentry.github.io/instructor-training/19-lessons/) if you plan to add or change some training materials, and also to check the [structure of the training material](#how-the-training-material-is-structured). 
+We strongly recommend you read and follow Software Carpentry's recommendations on [lesson design](https://swcarpentry.github.io/lesson-example/01-design/) and [lesson writing](https://swcarpentry.github.io/instructor-training/19-lessons/) if you plan to add or change some training materials, and also to check the [structure of the training material](#how-the-training-material-is-structured). ## Pull Requests @@ -98,7 +98,7 @@ The `images` directory collects all images/pictures needed for the training mate Images shared between several topics are in the `shared/images` directory at the root. -All images for the slides must be in `images` directory. The images must be in good quality. The sources (`svg` or other) of the images must also be added to the `images` directory. We encourage you to use [yEd](http://www.yworks.com/products/yed) to easily generate diagrams and [Inkscape](https://inkscape.org/en/) for any other images. +All images for the slides must be in `images` directory. The images must be in good quality. The sources (`svg` or other) of the images must also be added to the `images` directory. We encourage you to use [yEd](https://www.yworks.com/products/yed) to easily generate diagrams and [Inkscape](https://inkscape.org/en/) for any other images. ## `slides` directory @@ -112,7 +112,7 @@ This directory collects the tutorials related to the topic, one per subdirectory The templates for the tutorials are different from the other pages to help users to focus on the content of the tutorial. To improve the output of the tutorial, several metadata are mandatory for every tutorials, such as the requirements or the objectives of the tutorials. Boxes are also used to highlight some key points as the hands-on or the tips. -The content of each tutorial is generated with [Jekyll](http://jekyllrb.com/) from a Markdown file and some metadata (e.g. the requirements, the Zenodo link, the questions) defined inside the metadata of the related topic. 
+The content of each tutorial is generated with [Jekyll](https://jekyllrb.com/) from a Markdown file and some metadata (e.g. the requirements, the Zenodo link, the questions) defined inside the metadata of the related topic. > Want to contribute to a tutorial? > - [Check out how to add a new tutorial?](#how-do-i-add-a-new-tutorial) @@ -130,7 +130,7 @@ The `docker` image must also integrate a Galaxy tour from the [`galaxy-tours` re # How do I add new content? -Most of the content is written in Markdown with some metadata (or variables) stored in YAML. To generate the website, we are using [Jekyll](http://jekyllrb.com/) and its templating system. +Most of the content is written in Markdown with some metadata (or variables) stored in YAML. To generate the website, we are using [Jekyll](https://jekyllrb.com/) and its templating system. So if you want to visualise locally how the website will look like, you need to run a local Jekyll server. So, Jekyll must be installed using [RubyGems](https://rubygems.org/pages/download): @@ -172,7 +172,7 @@ You can then visualize locally ([http://localhost:4000/](http://localhost:4000/) - `maintainers`: the two maintainers of the topic with their `name`, `github_username`, `email` - `contributors`: list of people who contributed to the topic with `name`, `github_username`, `email` - This information is used with [Jekyll](http://jekyllrb.com/) to generate the webpage related to the topic + This information is used with [Jekyll](https://jekyllrb.com/) to generate the webpage related to the topic 3. 
Fill the introduction slides @@ -200,7 +200,7 @@ You can then visualize locally ([http://localhost:4000/](http://localhost:4000/) - `time_estimation`: estimation of the time needed to complete the hands-on - `key_points`: take home messages - This information will appear in the top and bottom of the online hands-on generated using [Jekyll](http://jekyllrb.com/) + This information will appear in the top and bottom of the online hands-on generated using [Jekyll](https://jekyllrb.com/) ![](shared/images/tutorial_header.png) @@ -223,7 +223,7 @@ You can then visualize locally ([http://localhost:4000/](http://localhost:4000/) 2. Fill the `tutorial.md` with the tutorial -The content of a tutorial hands-on is written in Markdown. They are rendered by [Jekyll](http://jekyllrb.com/) into the webpage for the tutorial. +The content of a tutorial hands-on is written in Markdown. They are rendered by [Jekyll](https://jekyllrb.com/) into the webpage for the tutorial. The header of the file must be something like: diff --git a/LICENSE.md b/LICENSE.md index 4aee8d7c..f1ce8258 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1 +1 @@ -This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. +This work is licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. diff --git a/README.md b/README.md index 156e4273..aac4a0fd 100644 --- a/README.md +++ b/README.md @@ -28,15 +28,15 @@ The Galaxy community offers many different ways of training. The table above lis # License -This work is licensed under the [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/). 
+This work is licensed under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/). # Acknowledgment and Funding We would like to thank all contributors to our Galaxy courses, especially those mentioned in the [Contributors list](CONTRIBUTORS.md), the Galaxy community for their constant support and our funding partners. - + -   +   --- You want to help us on this project? Please, see the [`CONTRIBUTING`](CONTRIBUTING.md) file. diff --git a/_layouts/home.html b/_layouts/home.html index fe9324bb..baaa728b 100644 --- a/_layouts/home.html +++ b/_layouts/home.html @@ -77,7 +77,7 @@

    - +
    diff --git a/shared/literature.md b/shared/literature.md index e5236a75..a7598e0c 100644 --- a/shared/literature.md +++ b/shared/literature.md @@ -2,36 +2,36 @@ ##Deep sequencing -**Zentner and Henikoff (2012):** [Surveying the epigenomic landscape, one base at a time](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2012-13-10-250), (doi:10.1186/gb-2012-13-10-250) - Overview of popular *-seq techniques; very nice description of DNase-seq, MNase-seq, FAIRE-seq etc. +**Zentner and Henikoff (2012):** [Surveying the epigenomic landscape, one base at a time](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2012-13-10-250), (doi:10.1186/gb-2012-13-10-250) - Overview of popular *-seq techniques; very nice description of DNase-seq, MNase-seq, FAIRE-seq etc. **Son and Taylor (2011):** [Preparing DNA Libraries for Multiplexed Paired-End Deep Sequencing for Illumina GA Sequencers](https://www.ncbi.nlm.nih.gov/pubmed/21400673), (doi:10.1002/9780471729259.mc01e04s20) - Paper on multiplexing; describes the individual steps of the Illumina deep sequencing protocols quite in detail -**Illumina's technical report** - focuses on [Illumina's sequencing technology](http://www.illumina.com/technology.html); nice educative figures +**Illumina's technical report** - focuses on [Illumina's sequencing technology](https://www.illumina.com/technology.html); nice educative figures ##NGS data formats - UCSC has a very good overview with brief descriptions of BED, bedGraph, bigWig etc.: https://genome.ucsc.edu/FAQ/FAQformat.html -- [VCF format](http://gatkforums.broadinstitute.org/gatk/discussion/1268/how-should-i-interpret-vcf-files-produced-by-the-gatk) (encoding SNPs, indels etc.): Very readable, albeit not exhausting description +- [VCF format](https://gatkforums.broadinstitute.org/gatk/discussion/1268/how-should-i-interpret-vcf-files-produced-by-the-gatk) (encoding SNPs, indels etc.): Very readable, albeit not exhausting description - Transcriptomes are often 
saved in [GFF3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) (this is what TopHat needs, for example), but just to make things more complicated, GTF is another format used for transcriptome information, too ##Bioinformatic Tools (Linux, R, BEDTools etc.) - Manuals, courses, original papers -- Why and how is bioinformatics software special? **Altschul et a. (2013)** [The anatomy of successful computational biology software](http://www.ncbi.nlm.nih.gov/pubmed/24104757), (doi:10.1038/nbt.2721) **(Highly recommended to read!)** -- **Bild et al. (2014)** [A Field Guide to Genomics Research](http://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1001744), (doi:10.1371/journal.pbio.1001744) - Very readable introduction about the different caveats of genomics research (with cute cartoons!) +- Why and how is bioinformatics software special? **Altschul et a. (2013)** [The anatomy of successful computational biology software](https://www.ncbi.nlm.nih.gov/pubmed/24104757), (doi:10.1038/nbt.2721) **(Highly recommended to read!)** +- **Bild et al. (2014)** [A Field Guide to Genomics Research](https://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1001744), (doi:10.1371/journal.pbio.1001744) - Very readable introduction about the different caveats of genomics research (with cute cartoons!) **Linux Command Line** -- [Linux & Perl Primer for Biologists](http://korflab.ucdavis.edu/Unix_and_Perl/unix_and_perl_v3.1.1.html) - Very entertaining introduction to command line commands and perl scripts with a focus on bioinformatic application, i.e. handling of DNA sequences +- [Linux & Perl Primer for Biologists](https://korflab.ucdavis.edu/Unix_and_Perl/unix_and_perl_v3.1.1.html) - Very entertaining introduction to command line commands and perl scripts with a focus on bioinformatic application, i.e. 
handling of DNA sequences - [Linux Tutorial for Beginners](http://www.ee.surrey.ac.uk/Teaching/Unix/) - Thorough, but concise online tutorial introducing the very basics of handling the Linux command line -- [Writing Linux shell scripts](http://www.freeos.com/guides/lsst/index.html) - Useful for slightly more advanced Linux command line users +- [Writing Linux shell scripts](https://www.freeos.com/guides/lsst/index.html) - Useful for slightly more advanced Linux command line users **R** -- [Hands on R course](http://www.uwyo.edu/mdillon/hor.html) - For beginners - R is probably the most widely used open-source statistical software; through our epicenter website you can also access RStudio which provides are very nice interface to working and plotting with R. In fact, most of the plots generated within Galaxy are generated through R scripts, so if you're not happy with the default formats of the Galaxy graphs, definitely have a look at R yourself. The learning curve is steep, but it is worth it. +- [Hands on R course](https://www.uwyo.edu/mdillon/hor.html) - For beginners - R is probably the most widely used open-source statistical software; through our epicenter website you can also access RStudio which provides a very nice interface to working and plotting with R. In fact, most of the plots generated within Galaxy are generated through R scripts, so if you're not happy with the default formats of the Galaxy graphs, definitely have a look at R yourself. The learning curve is steep, but it is worth it. **BEDTools** -- [BEDTools Manual](http://bedtools.readthedocs.org) - When working with genomic intervals (e.g. genes, peaks, enriched regions...), BEDTools are invaluable! The manual is a very good read and we refer to it almost daily. +- [BEDTools Manual](https://bedtools.readthedocs.org) - When working with genomic intervals (e.g. genes, peaks, enriched regions...), BEDTools are invaluable! The manual is a very good read and we refer to it almost daily. 
diff --git a/topics/admin/slides/index.html b/topics/admin/slides/index.html index a70cfe16..e4288e61 100644 --- a/topics/admin/slides/index.html +++ b/topics/admin/slides/index.html @@ -11,7 +11,7 @@ ### Several different ways -- [Main](http://usegalaxy.org/) and [others](https://galaxyproject.org/public-galaxy-servers/) free public servers +- [Main](https://usegalaxy.org/) and [others](https://galaxyproject.org/public-galaxy-servers/) free public servers - Run your [local instance](https://galaxyproject.org/admin/get-galaxy/) or a [docker](https://github.com/bgruening/docker-galaxy-stable) image - Use Galaxy on the [Cloud](https://galaxyproject.org/cloud/) - Get the SlipStream [Galaxy Appliance](https://bioteam.net/products/galaxy-appliance/) diff --git a/topics/admin/tutorials/database-schema/tutorial.md b/topics/admin/tutorials/database-schema/tutorial.md index cf5cc5c5..6783c2c1 100644 --- a/topics/admin/tutorials/database-schema/tutorial.md +++ b/topics/admin/tutorials/database-schema/tutorial.md @@ -31,7 +31,7 @@ The session description is database centric and we’ll be focusing on the relat database that backs Galaxy servers. But that’s only half the picture of the this data. The other is the object model which is the object-oriented view of this same data. The object model is used by the code to manipulate and access the database. -The translation between the two worlds is handled by an object-relational mapping implemented with SQLAlchemy (http://www.sqlalchemy.org). +The translation between the two worlds is handled by an object-relational mapping implemented with SQLAlchemy (https://www.sqlalchemy.org). Today we are covering the database and how to access it with SQL. We aren’t going to cover the corresponding object model or object relational mapping. @@ -55,7 +55,7 @@ What’s not in the database is the data. 
Datasets are stored outside the databa #### ER diagrams and SchemaSpy -Entity-relationship diagrams are a way to understand tables and the relationships between them inside a relational database. SchemaSpy (http://schemaspy.sourceforge.net/) is a free (and remarkable tool) for generating ER diagrams. We’be used it generate a description of the database backing the server in this container. See +Entity-relationship diagrams are a way to understand tables and the relationships between them inside a relational database. SchemaSpy (https://schemaspy.sourceforge.net/) is a free (and remarkable tool) for generating ER diagrams. We’ve used it to generate a description of the database backing the server in this container. See https://galaxyproject.org/schema/SchemaSpy/index.html @@ -327,7 +327,7 @@ https://docs.google.com/presentation/d/1l4DD0IaJjuvk1zAT1Sjv26bLyrSOg3VUm7rD-TQl To run SchemaSpy in your container you’ll need to get it, and also install some required software packages. > ```sh -> wget http://downloads.sourceforge.net/project/schemaspy/schemaspy/SchemaSpy%205.0.0/schemaSpy_5.0.0.jar +> wget https://downloads.sourceforge.net/project/schemaspy/schemaspy/SchemaSpy%205.0.0/schemaSpy_5.0.0.jar > apt-get update > apt-get install libpostgresql-jdbc-java > apt-get install graphviz diff --git a/topics/admin/tutorials/dev-to-production/tutorial.md b/topics/admin/tutorials/dev-to-production/tutorial.md index 8efb1391..67641924 100644 --- a/topics/admin/tutorials/dev-to-production/tutorial.md +++ b/topics/admin/tutorials/dev-to-production/tutorial.md @@ -63,7 +63,7 @@ To bind Galaxy to any avalaible network interface edit the config/galaxy.ini fil ## What did you just installed ? The galaxy you have just installed is configured with the following: -- [SQLite](http://www.sqlite.org/): a servless database. +- [SQLite](https://www.sqlite.org/): a serverless database. - A built-in HTTP server, written in Python. 
The tools are run locally and the galaxy server itself run in a single process. diff --git a/topics/admin/tutorials/galaxy-docker/slides.html b/topics/admin/tutorials/galaxy-docker/slides.html index e27ae2eb..7f81602e 100644 --- a/topics/admin/tutorials/galaxy-docker/slides.html +++ b/topics/admin/tutorials/galaxy-docker/slides.html @@ -682,7 +682,7 @@ docker run -i -t -p 8080:80 quay.io/bgruening ``` -http://bgruening.github.io/docker-galaxy-stable/ +https://bgruening.github.io/docker-galaxy-stable/ --- diff --git a/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md b/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md index a5c55c82..a3373d8d 100644 --- a/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md +++ b/topics/assembly/tutorials/debruijn-graph-assembly/tutorial.md @@ -7,7 +7,7 @@ tutorial_name: debruijn-graph-assembly # Optimised de Bruijn Graph assemblies using the Velvet Optimiser and SPAdes In this activity, we will perform *de novo* assemblies of a short read set using the Velvet Optimiser and the SPAdes assemblers. We are using the Velvet Optimiser for illustrative purposes. For real assembly work, a more suitable assembler should be chosen - such as SPAdes. -The Velvet Optimiser is a script written by Simon Gladman to optimise the k-mer size and coverage cutoff parameters for Velvet. More information can be found [here](http://github.com/slugger70/VelvetOptimiser) +The Velvet Optimiser is a script written by Simon Gladman to optimise the k-mer size and coverage cutoff parameters for Velvet. More information can be found [here](https://github.com/slugger70/VelvetOptimiser) SPAdes is a de novo genome assembler written by Pavel Pevzner's group in St. Petersburg. 
More details on it can be found [here](http://cab.spbu.ru/software/spades/) diff --git a/topics/assembly/tutorials/general-introduction/tutorial.md b/topics/assembly/tutorials/general-introduction/tutorial.md index 2c987085..3fccf3b6 100644 --- a/topics/assembly/tutorials/general-introduction/tutorial.md +++ b/topics/assembly/tutorials/general-introduction/tutorial.md @@ -121,7 +121,7 @@ The FastQC tool: Although we have warnings for two outputs (per base sequence content; Kmer content), we can ignore these for now. -For a fuller discussion of FastQC outputs and warnings, see the [FastQC website link](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/), including the section on each of the output [reports](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/), and examples of ["good"](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html) and ["bad"](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/bad_sequence_fastqc.html) Illumina data. +For a fuller discussion of FastQC outputs and warnings, see the [FastQC website link](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), including the section on each of the output [reports](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/3%20Analysis%20Modules/), and examples of ["good"](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/good_sequence_short_fastqc.html) and ["bad"](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/bad_sequence_fastqc.html) Illumina data. We won’t be doing anything to these data to clean it up as there isn’t much need. Therefore we will get on with the assembly! 
diff --git a/topics/chip-seq/slides/index.html b/topics/chip-seq/slides/index.html index c0f6b677..3e9880b3 100644 --- a/topics/chip-seq/slides/index.html +++ b/topics/chip-seq/slides/index.html @@ -21,7 +21,7 @@ ![](../images/szalkowski_schmid_2011.jpg) -[*Szalkowski & Schmid, Brief Bioinform, 2011*](http://bib.oxfordjournals.org/content/12/6/626.long) +[*Szalkowski & Schmid, Brief Bioinform, 2011*](https://bib.oxfordjournals.org/content/12/6/626.long) --- diff --git a/topics/chip-seq/tutorials/chip-seq/tutorial.md b/topics/chip-seq/tutorials/chip-seq/tutorial.md index e4cf1b52..e5c19d76 100644 --- a/topics/chip-seq/tutorials/chip-seq/tutorial.md +++ b/topics/chip-seq/tutorials/chip-seq/tutorial.md @@ -26,7 +26,7 @@ The slides for part 2 can be downloaded from here ## Hands on example -This exercise uses the dataset from the Nature publication by [Ross-Inness et al., 2012](http://www.ncbi.nlm.nih.gov/pubmed/22217937). +This exercise uses the dataset from the Nature publication by [Ross-Inness et al., 2012](https://www.ncbi.nlm.nih.gov/pubmed/22217937). The goal was to identify the binding sites of the Estrogen receptor, a transcription factor known to be associated with different types of breast cancer. To this end, ChIP-seq was performed in breast cancer cells from 4 patients of different outcomes (good and poor). For each ChIP-seq experiment there is a matching technical control, i.e., there are 8 samples in total, half of which are the so-called 'input' samples for which the same treatment as the ChIP-seq samples was done except for the immunoprecipitation step. @@ -42,7 +42,7 @@ Create a new history for this exercise. - Have a look at the file by clicking on the 'eye' icon. There is a lot of text, but can you spot where the DNA sequence is stored? Can you guess what the other entries mean? -- Run the tool `FastQC` on one of the two FASTQ files to control the quality of the reads. 
An explanation of the results can be found on the [FastQC web page](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +- Run the tool `FastQC` on one of the two FASTQ files to control the quality of the reads. An explanation of the results can be found on the [FastQC web page](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). **Step 2: Trimming and clipping of reads** @@ -102,7 +102,7 @@ We expect that the replicates of the ChIP-seq experiments should be clustered mo - After the computation is done, run `plotCorrelation` from the deepTools package to visualize the results. Feel free to try different parameters. -More information on these two tools can be found at the [deepTools documentation page](http://deeptools.readthedocs.io/en/latest/content/list_of_tools.html). +More information on these two tools can be found at the [deepTools documentation page](https://deeptools.readthedocs.io/en/latest/content/list_of_tools.html). **Step 7: GC bias assessment** @@ -122,7 +122,7 @@ Does this dataset have a GC bias? - Set as fragment size 100. - Limit the operation to only one chromosome. -What do you think about the quality of the IP for this experiment? If you are not sure how to interpret the resulting plots, please read the information [here](http://deeptools.readthedocs.io/en/latest/content/tools/plotFingerprint.html#background) +What do you think about the quality of the IP for this experiment? If you are not sure how to interpret the resulting plots, please read the information [here](https://deeptools.readthedocs.io/en/latest/content/tools/plotFingerprint.html#background) **Step 9: Generate coverage files normalized by sequencing depth** @@ -204,52 +204,52 @@ Use the regions provided by the gene annotation file downloaded from UCSC and yo ###ChIP-seq in general: -**Landt et al. 
(2012):** [ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia](http://genome.cshlp.org/content/22/9/1813.long), (doi:10.1101/gr.136184.111) - This is a very useful "encyclopedic" paper with many details about the tools the (mod)ENCODE consortia use. It also contains a long section about antibody validation etc.. It does not explain much of the reasoning behind the bioinformatics tools, though. +**Landt et al. (2012):** [ChIP-seq guidelines and practices of the ENCODE and modENCODE consortia](https://genome.cshlp.org/content/22/9/1813.long), (doi:10.1101/gr.136184.111) - This is a very useful "encyclopedic" paper with many details about the tools the (mod)ENCODE consortia use. It also contains a long section about antibody validation etc.. It does not explain much of the reasoning behind the bioinformatics tools, though. -**Zentner and Henikoff (2012):** [Surveying the epigenomic landscape, one base at a time](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2012-13-10-250), (doi:10.1186/gb-2012-13-10-250) - Overview of popular *-seq techniques; very nice description of DNase-seq, MNase-seq, FAIRE-seq etc. +**Zentner and Henikoff (2012):** [Surveying the epigenomic landscape, one base at a time](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2012-13-10-250), (doi:10.1186/gb-2012-13-10-250) - Overview of popular *-seq techniques; very nice description of DNase-seq, MNase-seq, FAIRE-seq etc. -**Kidder et al. (2011):** [Technical considerations to obtaining high-quality data](http://www.nature.com/ni/journal/v12/n10/abs/ni.2117.html), (doi:10.1038/ni.2117) - Nice, readable introduction into all aspects of ChIP-seq experiments (from antibodies to cell numbers to replicates to data analysis) +**Kidder et al. 
(2011):** [Technical considerations to obtaining high-quality data](https://www.nature.com/ni/journal/v12/n10/abs/ni.2117.html), (doi:10.1038/ni.2117) - Nice, readable introduction into all aspects of ChIP-seq experiments (from antibodies to cell numbers to replicates to data analysis) -**Leleu et al. (2010):** [Processing and analyzing ChIP-seq data](http://www.ncbi.nlm.nih.gov/pubmed/20861161), (doi: 10.1093/bfgp/elq022) - Fairly detailed review of key concepts of ChIP-seq data processing (less detailed on analysis) +**Leleu et al. (2010):** [Processing and analyzing ChIP-seq data](https://www.ncbi.nlm.nih.gov/pubmed/20861161), (doi: 10.1093/bfgp/elq022) - Fairly detailed review of key concepts of ChIP-seq data processing (less detailed on analysis) **Peter Park (2009):** [ChIP-seq: Advantages and challenges of a maturing technology](https://www.ncbi.nlm.nih.gov/pubmed/19736561), (doi:10.1038/nrg2641) -**Kharchenko et al. (2008):** [Design and analysis of ChIP-seq experiments for DNA-binding proteins](http://www.ncbi.nlm.nih.gov/pubmed/19029915), (doi:10.1038/nbt.1508) +**Kharchenko et al. (2008):** [Design and analysis of ChIP-seq experiments for DNA-binding proteins](https://www.ncbi.nlm.nih.gov/pubmed/19029915), (doi:10.1038/nbt.1508) -**Liu et al. (2010):** [Q&A: ChIP-seq technologies and the study of gene regulation](http://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-8-56), (doi:10.1186/1741-7007-8-56) - Short overview of several (typical) issues of ChIP-seq analysis +**Liu et al. (2010):** [Q&A: ChIP-seq technologies and the study of gene regulation](https://bmcbiol.biomedcentral.com/articles/10.1186/1741-7007-8-56), (doi:10.1186/1741-7007-8-56) - Short overview of several (typical) issues of ChIP-seq analysis -**Carroll et al. (2014):** [Impact of artifact removal on ChIP quality metrics in ChIP-seq and ChIP-exo data](http://journal.frontiersin.org/article/10.3389/fgene.2014.00075/full),(doi:10.3389/fgene.2014.00075) +**Carroll et al. 
(2014):** [Impact of artifact removal on ChIP quality metrics in ChIP-seq and ChIP-exo data](https://journal.frontiersin.org/article/10.3389/fgene.2014.00075/full),(doi:10.3389/fgene.2014.00075) ###Peak Calling Methods (ChIP-seq) -**Pepke et al. (2009):** [Computation for ChIP-seq and RNA-seq studies](http://www.ncbi.nlm.nih.gov/pubmed/19844228), (doi: 10.1038/nmeth.1371) - First comparison of peak callers, focuses on the explanation of basic principles of ChIP-seq data processing and general workflows of peak calling algorithms +**Pepke et al. (2009):** [Computation for ChIP-seq and RNA-seq studies](https://www.ncbi.nlm.nih.gov/pubmed/19844228), (doi: 10.1038/nmeth.1371) - First comparison of peak callers, focuses on the explanation of basic principles of ChIP-seq data processing and general workflows of peak calling algorithms -**Wilbanks et al. (2010):** [Evaluation of Algorithm Performance in ChIP-Seq Peak Detection](http://www.ncbi.nlm.nih.gov/pubmed/20628599), (doi: 10.1371/journal.pone.0011471) - Another comparison of peak callers - focuses more on the evaluation of the peak callers performances than Pepke et al. (2009) +**Wilbanks et al. (2010):** [Evaluation of Algorithm Performance in ChIP-Seq Peak Detection](https://www.ncbi.nlm.nih.gov/pubmed/20628599), (doi: 10.1371/journal.pone.0011471) - Another comparison of peak callers - focuses more on the evaluation of the peak callers performances than Pepke et al. (2009) -**Micsinai et al. (2012):** [Picking ChIP-seq peak detectors for analyzing chromatin modification experiments](http://www.ncbi.nlm.nih.gov/pubmed/22307239), (doi: 10.1093/nar/gks048) - How to choose the best peak caller for your data set - their finding: default parameters, surprisingly, yield the most reproducible results regardless of the data set type +**Micsinai et al. 
(2012):** [Picking ChIP-seq peak detectors for analyzing chromatin modification experiments](https://www.ncbi.nlm.nih.gov/pubmed/22307239), (doi: 10.1093/nar/gks048) - How to choose the best peak caller for your data set - their finding: default parameters, surprisingly, yield the most reproducible results regardless of the data set type #### MACS -**Fen et al. (2012):** [Identifying ChIP-seq enrichment using MACS.](http://www.ncbi.nlm.nih.gov/pubmed/22936215), (doi:10.1038/nprot.2012.101) - How to use MACS - Nature Protocols +**Fen et al. (2012):** [Identifying ChIP-seq enrichment using MACS.](https://www.ncbi.nlm.nih.gov/pubmed/22936215), (doi:10.1038/nprot.2012.101) - How to use MACS - Nature Protocols -**Zhang et al. (2008):** [Model-based Analysis of ChIP-Seq (MACS)](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137), (doi:10.1186/gb-2008-9-9-r137) - The original publication of MACS +**Zhang et al. (2008):** [Model-based Analysis of ChIP-Seq (MACS)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137), (doi:10.1186/gb-2008-9-9-r137) - The original publication of MACS ### DNA motif analysis -**Das et al. (2007):** [A survey of DNA motif finding algorithms](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-S7-S21), (doi:10.1186/1471-2105-8-S7-S21) - Review of motif analysis tools +**Das et al. 
(2007):** [A survey of DNA motif finding algorithms](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-S7-S21), (doi:10.1186/1471-2105-8-S7-S21) - Review of motif analysis tools #### MEME (suite) -**Machanick and Bailey (2011):** [MEME-ChIP: motif analysis of large DNA datasets](http://www.ncbi.nlm.nih.gov/pubmed/21486936), (doi: 10.1093/bioinformatics/btr189) - MEME-ChIP-paper +**Machanick and Bailey (2011):** [MEME-ChIP: motif analysis of large DNA datasets](https://www.ncbi.nlm.nih.gov/pubmed/21486936), (doi: 10.1093/bioinformatics/btr189) - MEME-ChIP-paper -**Bailey and Machanick (2012):** [Inferring direct DNA binding from ChIP-seq](http://www.ncbi.nlm.nih.gov/pubmed/22610855), (doi:10.1093/nar/gks433) - Centrimo: position-specific motif analysis, especially useful for ChIP-seq data +**Bailey and Machanick (2012):** [Inferring direct DNA binding from ChIP-seq](https://www.ncbi.nlm.nih.gov/pubmed/22610855), (doi:10.1093/nar/gks433) - Centrimo: position-specific motif analysis, especially useful for ChIP-seq data -[TomTom](http://meme-suite.org/tools/tomtom) - Meme Suite Motif comparison tool: tool for the comparison of motifs from databases (not in Galaxy yet): [Manual](http://meme-suite.org/doc/tomtom.html?man_type=web) +[TomTom](https://meme-suite.org/tools/tomtom) - Meme Suite Motif comparison tool: tool for the comparison of motifs from databases (not in Galaxy yet): [Manual](http://meme-suite.org/doc/tomtom.html?man_type=web) #### TRAP -**Thomas-Chollier et al. (2012):** [Transcription factor binding predictions using TRAP for the analysis of ChIP-seq data and regulatory SNPs](http://www.ncbi.nlm.nih.gov/pubmed/22051799), (doi:10.1038/nprot.2011.409) - How to use TRAP - Nature Protocols +**Thomas-Chollier et al. 
(2012):** [Transcription factor binding predictions using TRAP for the analysis of ChIP-seq data and regulatory SNPs](https://www.ncbi.nlm.nih.gov/pubmed/22051799), (doi:10.1038/nprot.2011.409) - How to use TRAP - Nature Protocols -**Roider et al. (2006):** [Predicting transcription factor affinities to DNA from a biophysical model.](http://www.ncbi.nlm.nih.gov/pubmed/17098775), (doi:10.1093/bioinformatics/btl565) - Theoretical background of TRAP +**Roider et al. (2006):** [Predicting transcription factor affinities to DNA from a biophysical model.](https://www.ncbi.nlm.nih.gov/pubmed/17098775), (doi:10.1093/bioinformatics/btl565) - Theoretical background of TRAP diff --git a/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md b/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md index a7eccd99..4e3a16df 100644 --- a/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md +++ b/topics/chip-seq/tutorials/tal1-binding-site-identification/tutorial.md @@ -6,7 +6,7 @@ tutorial_name: tal1-binding-site-identification # Introduction -This tutorial uses ChIP-seq datasets from a study published by [Wu *et al.* (2014)](http://genome.cshlp.org/content/24/12/1945.full.pdf+html). The goal of this study was to investigate "the dynamics of occupancy and the role in gene regulation of the transcription factor Tal1, a critical regulator of hematopoiesis, at multiple stages of hematopoietic differentiation." +This tutorial uses ChIP-seq datasets from a study published by [Wu *et al.* (2014)](https://genome.cshlp.org/content/24/12/1945.full.pdf+html). The goal of this study was to investigate "the dynamics of occupancy and the role in gene regulation of the transcription factor Tal1, a critical regulator of hematopoiesis, at multiple stages of hematopoietic differentiation." 
To this end, ChIP-seq experiments were performed in multiple mouse cell types including G1E - a GATA-null immortalized cell line derived from targeted disruption of GATA-1 in mouse embryonic stem cells - and megakaryocytes. @@ -75,7 +75,7 @@ As for any NGS data analysis, ChIP-seq data must be quality controlled before be > > 2. What is the main difference between a FASTQ and a FASTA file? > {: .question} > -> 4. **FastQC** :wrench:: Run the tool **FastQC** on each FASTQ file to assess the quality of the raw data. An explanation of the results can be found on the [FastQC web page](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). +> 4. **FastQC** :wrench:: Run the tool **FastQC** on each FASTQ file to assess the quality of the raw data. An explanation of the results can be found on the [FastQC web page](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). > > > ### :bulb: Tip: Running a tool on multiple data files > > @@ -162,7 +162,7 @@ Nowadays, there are many read alignment programs for sequenced DNA, `BWA` being To assess the similarity between the replicates sequencing datasets, it is a common technique to calculate the correlation of read counts for the different samples. -We expect that the replicates of the ChIP-seq experiments should be clustered more closely to each other than the replicates of the input samples. We will be use tools from the package **deepTools** for the next few steps. More information on **deepTools** can be found [here](http://deeptools.readthedocs.io/en/latest/content/list_of_tools.html). +We expect that the replicates of the ChIP-seq experiments should be clustered more closely to each other than the replicates of the input samples. We will use tools from the package **deepTools** for the next few steps. More information on **deepTools** can be found [here](https://deeptools.readthedocs.io/en/latest/content/list_of_tools.html). 
> ### :pencil2: Hands-on: Assessing correlation between samples > @@ -202,7 +202,7 @@ We expect that the replicates of the ChIP-seq experiments should be clustered mo >

    Figure 8: Heatmap of correlation matrix generated by **plotCorrelation**.
    {: .hands_on} -For additional informaton on how to interpret **plotCorrelation** plots, read the information [here](http://deeptools.readthedocs.io/en/latest/content/tools/plotCorrelation.html#background) +For additional informaton on how to interpret **plotCorrelation** plots, read the information [here](https://deeptools.readthedocs.io/en/latest/content/tools/plotCorrelation.html#background) # Step 5: Assessing IP strength @@ -235,7 +235,7 @@ We will now evaluate the quality of the immuno-precipitation step in the ChIP-se > {: .question} {: .hands_on} -For additional informaton on how to interpret **plotFingerprint** plots, read the information [here](http://deeptools.readthedocs.io/en/latest/content/tools/plotFingerprint.html#background) +For additional informaton on how to interpret **plotFingerprint** plots, read the information [here](https://deeptools.readthedocs.io/en/latest/content/tools/plotFingerprint.html#background) # Step 6: Generating Input normalized coverage files @@ -259,7 +259,7 @@ Now that **BWA** has aligned the reads to the genome, we will use the tool **MAC 1. Identify regions of Tal1 occupancy (peaks) 2. Generate bedgraph files for visual inspection of the data on a genome browser. -More information about **MACS2** can be found [here](http://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137). +More information about **MACS2** can be found [here](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137). 
> ### :pencil2: Hands-on: Determining Tal1 binding sites > @@ -438,7 +438,7 @@ We will now check whether the samples have more reads from regions of the genome > {: .question} {: .hands_on} -For additional informaton on how to interpret **computeGCbias** plots, read the information [here](http://deeptools.readthedocs.io/en/latest/content/tools/computeGCBias.html#background) +For additional informaton on how to interpret **computeGCbias** plots, read the information [here](https://deeptools.readthedocs.io/en/latest/content/tools/computeGCBias.html#background) # Conclusion diff --git a/topics/dev/files/hands_on-tool_integration/seqtk/tool_test_output.html b/topics/dev/files/hands_on-tool_integration/seqtk/tool_test_output.html index fe5c0e27..6fda6519 100644 --- a/topics/dev/files/hands_on-tool_integration/seqtk/tool_test_output.html +++ b/topics/dev/files/hands_on-tool_integration/seqtk/tool_test_output.html @@ -8,7 +8,7 @@ @@ -196,7 +196,7 @@

    Tests

    },removeAttr:function(a,b){var c,d,e=0,f=b&&b.match(E);if(f&&1===a.nodeType)while(c=f[e++])d=n.propFix[c]||c,n.expr.match.bool.test(c)&&(a[d]=!1),a.removeAttribute(c)},attrHooks:{type:{set:function(a,b){if(!k.radioValue&&"radio"===b&&n.nodeName(a,"input")){var c=a.value;return a.setAttribute("type",b),c&&(a.value=c),b}}}}}),Zb={set:function(a,b,c){return b===!1?n.removeAttr(a,c):a.setAttribute(c,c),c}},n.each(n.expr.match.bool.source.match(/\w+/g),function(a,b){var c=$b[b]||n.find.attr;$b[b]=function(a,b,d){var e,f;return d||(f=$b[b],$b[b]=e,e=null!=c(a,b,d)?b.toLowerCase():null,$b[b]=f),e}});var _b=/^(?:input|select|textarea|button)$/i;n.fn.extend({prop:function(a,b){return J(this,n.prop,a,b,arguments.length>1)},removeProp:function(a){return this.each(function(){delete this[n.propFix[a]||a]})}}),n.extend({propFix:{"for":"htmlFor","class":"className"},prop:function(a,b,c){var d,e,f,g=a.nodeType;if(a&&3!==g&&8!==g&&2!==g)return f=1!==g||!n.isXMLDoc(a),f&&(b=n.propFix[b]||b,e=n.propHooks[b]),void 0!==c?e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:a[b]=c:e&&"get"in e&&null!==(d=e.get(a,b))?d:a[b]},propHooks:{tabIndex:{get:function(a){return a.hasAttribute("tabindex")||_b.test(a.nodeName)||a.href?a.tabIndex:-1}}}}),k.optSelected||(n.propHooks.selected={get:function(a){var b=a.parentNode;return b&&b.parentNode&&b.parentNode.selectedIndex,null}}),n.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){n.propFix[this.toLowerCase()]=this});var ac=/[\t\r\n\f]/g;n.fn.extend({addClass:function(a){var b,c,d,e,f,g,h="string"==typeof a&&a,i=0,j=this.length;if(n.isFunction(a))return this.each(function(b){n(this).addClass(a.call(this,b,this.className))});if(h)for(b=(a||"").match(E)||[];j>i;i++)if(c=this[i],d=1===c.nodeType&&(c.className?(" "+c.className+" ").replace(ac," "):" ")){f=0;while(e=b[f++])d.indexOf(" "+e+" ")<0&&(d+=e+" ");g=n.trim(d),c.className!==g&&(c.className=g)}return 
this},removeClass:function(a){var b,c,d,e,f,g,h=0===arguments.length||"string"==typeof a&&a,i=0,j=this.length;if(n.isFunction(a))return this.each(function(b){n(this).removeClass(a.call(this,b,this.className))});if(h)for(b=(a||"").match(E)||[];j>i;i++)if(c=this[i],d=1===c.nodeType&&(c.className?(" "+c.className+" ").replace(ac," "):"")){f=0;while(e=b[f++])while(d.indexOf(" "+e+" ")>=0)d=d.replace(" "+e+" "," ");g=a?n.trim(d):"",c.className!==g&&(c.className=g)}return this},toggleClass:function(a,b){var c=typeof a;return"boolean"==typeof b&&"string"===c?b?this.addClass(a):this.removeClass(a):this.each(n.isFunction(a)?function(c){n(this).toggleClass(a.call(this,c,this.className,b),b)}:function(){if("string"===c){var b,d=0,e=n(this),f=a.match(E)||[];while(b=f[d++])e.hasClass(b)?e.removeClass(b):e.addClass(b)}else(c===U||"boolean"===c)&&(this.className&&L.set(this,"__className__",this.className),this.className=this.className||a===!1?"":L.get(this,"__className__")||"")})},hasClass:function(a){for(var b=" "+a+" ",c=0,d=this.length;d>c;c++)if(1===this[c].nodeType&&(" "+this[c].className+" ").replace(ac," ").indexOf(b)>=0)return!0;return!1}});var bc=/\r/g;n.fn.extend({val:function(a){var b,c,d,e=this[0];{if(arguments.length)return d=n.isFunction(a),this.each(function(c){var e;1===this.nodeType&&(e=d?a.call(this,c,n(this).val()):a,null==e?e="":"number"==typeof e?e+="":n.isArray(e)&&(e=n.map(e,function(a){return null==a?"":a+""})),b=n.valHooks[this.type]||n.valHooks[this.nodeName.toLowerCase()],b&&"set"in b&&void 0!==b.set(this,e,"value")||(this.value=e))});if(e)return b=n.valHooks[e.type]||n.valHooks[e.nodeName.toLowerCase()],b&&"get"in b&&void 0!==(c=b.get(e,"value"))?c:(c=e.value,"string"==typeof c?c.replace(bc,""):null==c?"":c)}}}),n.extend({valHooks:{option:{get:function(a){var b=n.find.attr(a,"value");return null!=b?b:n.trim(n.text(a))}},select:{get:function(a){for(var 
b,c,d=a.options,e=a.selectedIndex,f="select-one"===a.type||0>e,g=f?null:[],h=f?e+1:d.length,i=0>e?h:f?e:0;h>i;i++)if(c=d[i],!(!c.selected&&i!==e||(k.optDisabled?c.disabled:null!==c.getAttribute("disabled"))||c.parentNode.disabled&&n.nodeName(c.parentNode,"optgroup"))){if(b=n(c).val(),f)return b;g.push(b)}return g},set:function(a,b){var c,d,e=a.options,f=n.makeArray(b),g=e.length;while(g--)d=e[g],(d.selected=n.inArray(d.value,f)>=0)&&(c=!0);return c||(a.selectedIndex=-1),f}}}}),n.each(["radio","checkbox"],function(){n.valHooks[this]={set:function(a,b){return n.isArray(b)?a.checked=n.inArray(n(a).val(),b)>=0:void 0}},k.checkOn||(n.valHooks[this].get=function(a){return null===a.getAttribute("value")?"on":a.value})}),n.each("blur focus focusin focusout load resize scroll unload click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup error contextmenu".split(" "),function(a,b){n.fn[b]=function(a,c){return arguments.length>0?this.on(b,null,a,c):this.trigger(b)}}),n.fn.extend({hover:function(a,b){return this.mouseenter(a).mouseleave(b||a)},bind:function(a,b,c){return this.on(a,null,b,c)},unbind:function(a,b){return this.off(a,null,b)},delegate:function(a,b,c,d){return this.on(b,a,c,d)},undelegate:function(a,b,c){return 1===arguments.length?this.off(a,"**"):this.off(b,a||"**",c)}});var cc=n.now(),dc=/\?/;n.parseJSON=function(a){return JSON.parse(a+"")},n.parseXML=function(a){var b,c;if(!a||"string"!=typeof a)return null;try{c=new DOMParser,b=c.parseFromString(a,"text/xml")}catch(d){b=void 0}return(!b||b.getElementsByTagName("parsererror").length)&&n.error("Invalid XML: "+a),b};var ec,fc,gc=/#.*$/,hc=/([?&])_=[^&]*/,ic=/^(.*?):[ 
\t]*([^\r\n]*)$/gm,jc=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,kc=/^(?:GET|HEAD)$/,lc=/^\/\//,mc=/^([\w.+-]+:)(?:\/\/(?:[^\/?#]*@|)([^\/?#:]*)(?::(\d+)|)|)/,nc={},oc={},pc="*/".concat("*");try{fc=location.href}catch(qc){fc=l.createElement("a"),fc.href="",fc=fc.href}ec=mc.exec(fc.toLowerCase())||[];function rc(a){return function(b,c){"string"!=typeof b&&(c=b,b="*");var d,e=0,f=b.toLowerCase().match(E)||[];if(n.isFunction(c))while(d=f[e++])"+"===d[0]?(d=d.slice(1)||"*",(a[d]=a[d]||[]).unshift(c)):(a[d]=a[d]||[]).push(c)}}function sc(a,b,c,d){var e={},f=a===oc;function g(h){var i;return e[h]=!0,n.each(a[h]||[],function(a,h){var j=h(b,c,d);return"string"!=typeof j||f||e[j]?f?!(i=j):void 0:(b.dataTypes.unshift(j),g(j),!1)}),i}return g(b.dataTypes[0])||!e["*"]&&g("*")}function tc(a,b){var c,d,e=n.ajaxSettings.flatOptions||{};for(c in b)void 0!==b[c]&&((e[c]?a:d||(d={}))[c]=b[c]);return d&&n.extend(!0,a,d),a}function uc(a,b,c){var d,e,f,g,h=a.contents,i=a.dataTypes;while("*"===i[0])i.shift(),void 0===d&&(d=a.mimeType||b.getResponseHeader("Content-Type"));if(d)for(e in h)if(h[e]&&h[e].test(d)){i.unshift(e);break}if(i[0]in c)f=i[0];else{for(e in c){if(!i[0]||a.converters[e+" "+i[0]]){f=e;break}g||(g=e)}f=f||g}return f?(f!==i[0]&&i.unshift(f),c[f]):void 0}function vc(a,b,c,d){var e,f,g,h,i,j={},k=a.dataTypes.slice();if(k[1])for(g in a.converters)j[g.toLowerCase()]=a.converters[g];f=k.shift();while(f)if(a.responseFields[f]&&(c[a.responseFields[f]]=b),!i&&d&&a.dataFilter&&(b=a.dataFilter(b,a.dataType)),i=f,f=k.shift())if("*"===f)f=i;else if("*"!==i&&i!==f){if(g=j[i+" "+f]||j["* "+f],!g)for(e in j)if(h=e.split(" "),h[1]===f&&(g=j[i+" "+h[0]]||j["* "+h[0]])){g===!0?g=j[e]:j[e]!==!0&&(f=h[0],k.unshift(h[1]));break}if(g!==!0)if(g&&a["throws"])b=g(b);else try{b=g(b)}catch(l){return{state:"parsererror",error:g?l:"No conversion from "+i+" to 
"+f}}}return{state:"success",data:b}}n.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:fc,type:"GET",isLocal:jc.test(ec[1]),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":pc,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/xml/,html:/html/,json:/json/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":n.parseJSON,"text xml":n.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(a,b){return b?tc(tc(a,n.ajaxSettings),b):tc(n.ajaxSettings,a)},ajaxPrefilter:rc(nc),ajaxTransport:rc(oc),ajax:function(a,b){"object"==typeof a&&(b=a,a=void 0),b=b||{};var c,d,e,f,g,h,i,j,k=n.ajaxSetup({},b),l=k.context||k,m=k.context&&(l.nodeType||l.jquery)?n(l):n.event,o=n.Deferred(),p=n.Callbacks("once memory"),q=k.statusCode||{},r={},s={},t=0,u="canceled",v={readyState:0,getResponseHeader:function(a){var b;if(2===t){if(!f){f={};while(b=ic.exec(e))f[b[1].toLowerCase()]=b[2]}b=f[a.toLowerCase()]}return null==b?null:b},getAllResponseHeaders:function(){return 2===t?e:null},setRequestHeader:function(a,b){var c=a.toLowerCase();return t||(a=s[c]=s[c]||a,r[a]=b),this},overrideMimeType:function(a){return t||(k.mimeType=a),this},statusCode:function(a){var b;if(a)if(2>t)for(b in a)q[b]=[q[b],a[b]];else v.always(a[v.status]);return this},abort:function(a){var b=a||u;return c&&c.abort(b),x(0,b),this}};if(o.promise(v).complete=p.add,v.success=v.done,v.error=v.fail,k.url=((a||k.url||fc)+"").replace(gc,"").replace(lc,ec[1]+"//"),k.type=b.method||b.type||k.method||k.type,k.dataTypes=n.trim(k.dataType||"*").toLowerCase().match(E)||[""],null==k.crossDomain&&(h=mc.exec(k.url.toLowerCase()),k.crossDomain=!(!h||h[1]===ec[1]&&h[2]===ec[2]&&(h[3]||("http:"===h[1]?"80":"443"))===(ec[3]||("http:"===ec[1]?"80":"443")))),k.data&&k.processData&&"string"!=typeof 
k.data&&(k.data=n.param(k.data,k.traditional)),sc(nc,k,b,v),2===t)return v;i=k.global,i&&0===n.active++&&n.event.trigger("ajaxStart"),k.type=k.type.toUpperCase(),k.hasContent=!kc.test(k.type),d=k.url,k.hasContent||(k.data&&(d=k.url+=(dc.test(d)?"&":"?")+k.data,delete k.data),k.cache===!1&&(k.url=hc.test(d)?d.replace(hc,"$1_="+cc++):d+(dc.test(d)?"&":"?")+"_="+cc++)),k.ifModified&&(n.lastModified[d]&&v.setRequestHeader("If-Modified-Since",n.lastModified[d]),n.etag[d]&&v.setRequestHeader("If-None-Match",n.etag[d])),(k.data&&k.hasContent&&k.contentType!==!1||b.contentType)&&v.setRequestHeader("Content-Type",k.contentType),v.setRequestHeader("Accept",k.dataTypes[0]&&k.accepts[k.dataTypes[0]]?k.accepts[k.dataTypes[0]]+("*"!==k.dataTypes[0]?", "+pc+"; q=0.01":""):k.accepts["*"]);for(j in k.headers)v.setRequestHeader(j,k.headers[j]);if(k.beforeSend&&(k.beforeSend.call(l,v,k)===!1||2===t))return v.abort();u="abort";for(j in{success:1,error:1,complete:1})v[j](k[j]);if(c=sc(oc,k,b,v)){v.readyState=1,i&&m.trigger("ajaxSend",[v,k]),k.async&&k.timeout>0&&(g=setTimeout(function(){v.abort("timeout")},k.timeout));try{t=1,c.send(r,x)}catch(w){if(!(2>t))throw w;x(-1,w)}}else x(-1,"No Transport");function x(a,b,f,h){var j,r,s,u,w,x=b;2!==t&&(t=2,g&&clearTimeout(g),c=void 0,e=h||"",v.readyState=a>0?4:0,j=a>=200&&300>a||304===a,f&&(u=uc(k,v,f)),u=vc(k,u,v,j),j?(k.ifModified&&(w=v.getResponseHeader("Last-Modified"),w&&(n.lastModified[d]=w),w=v.getResponseHeader("etag"),w&&(n.etag[d]=w)),204===a||"HEAD"===k.type?x="nocontent":304===a?x="notmodified":(x=u.state,r=u.data,s=u.error,j=!s)):(s=x,(a||!x)&&(x="error",0>a&&(a=0))),v.status=a,v.statusText=(b||x)+"",j?o.resolveWith(l,[r,x,v]):o.rejectWith(l,[v,x,s]),v.statusCode(q),q=void 0,i&&m.trigger(j?"ajaxSuccess":"ajaxError",[v,k,j?r:s]),p.fireWith(l,[v,x]),i&&(m.trigger("ajaxComplete",[v,k]),--n.active||n.event.trigger("ajaxStop")))}return v},getJSON:function(a,b,c){return n.get(a,b,c,"json")},getScript:function(a,b){return n.get(a,void 
0,b,"script")}}),n.each(["get","post"],function(a,b){n[b]=function(a,c,d,e){return n.isFunction(c)&&(e=e||d,d=c,c=void 0),n.ajax({url:a,type:b,dataType:e,data:c,success:d})}}),n.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(a,b){n.fn[b]=function(a){return this.on(b,a)}}),n._evalUrl=function(a){return n.ajax({url:a,type:"GET",dataType:"script",async:!1,global:!1,"throws":!0})},n.fn.extend({wrapAll:function(a){var b;return n.isFunction(a)?this.each(function(b){n(this).wrapAll(a.call(this,b))}):(this[0]&&(b=n(a,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&b.insertBefore(this[0]),b.map(function(){var a=this;while(a.firstElementChild)a=a.firstElementChild;return a}).append(this)),this)},wrapInner:function(a){return this.each(n.isFunction(a)?function(b){n(this).wrapInner(a.call(this,b))}:function(){var b=n(this),c=b.contents();c.length?c.wrapAll(a):b.append(a)})},wrap:function(a){var b=n.isFunction(a);return this.each(function(c){n(this).wrapAll(b?a.call(this,c):a)})},unwrap:function(){return this.parent().each(function(){n.nodeName(this,"body")||n(this).replaceWith(this.childNodes)}).end()}}),n.expr.filters.hidden=function(a){return a.offsetWidth<=0&&a.offsetHeight<=0},n.expr.filters.visible=function(a){return!n.expr.filters.hidden(a)};var wc=/%20/g,xc=/\[\]$/,yc=/\r?\n/g,zc=/^(?:submit|button|image|reset|file)$/i,Ac=/^(?:input|select|textarea|keygen)/i;function Bc(a,b,c,d){var e;if(n.isArray(b))n.each(b,function(b,e){c||xc.test(a)?d(a,e):Bc(a+"["+("object"==typeof e?b:"")+"]",e,c,d)});else if(c||"object"!==n.type(b))d(a,b);else for(e in b)Bc(a+"["+e+"]",b[e],c,d)}n.param=function(a,b){var c,d=[],e=function(a,b){b=n.isFunction(b)?b():null==b?"":b,d[d.length]=encodeURIComponent(a)+"="+encodeURIComponent(b)};if(void 0===b&&(b=n.ajaxSettings&&n.ajaxSettings.traditional),n.isArray(a)||a.jquery&&!n.isPlainObject(a))n.each(a,function(){e(this.name,this.value)});else for(c in a)Bc(c,a[c],b,e);return 
d.join("&").replace(wc,"+")},n.fn.extend({serialize:function(){return n.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var a=n.prop(this,"elements");return a?n.makeArray(a):this}).filter(function(){var a=this.type;return this.name&&!n(this).is(":disabled")&&Ac.test(this.nodeName)&&!zc.test(a)&&(this.checked||!T.test(a))}).map(function(a,b){var c=n(this).val();return null==c?null:n.isArray(c)?n.map(c,function(a){return{name:b.name,value:a.replace(yc,"\r\n")}}):{name:b.name,value:c.replace(yc,"\r\n")}}).get()}}),n.ajaxSettings.xhr=function(){try{return new XMLHttpRequest}catch(a){}};var Cc=0,Dc={},Ec={0:200,1223:204},Fc=n.ajaxSettings.xhr();a.ActiveXObject&&n(a).on("unload",function(){for(var a in Dc)Dc[a]()}),k.cors=!!Fc&&"withCredentials"in Fc,k.ajax=Fc=!!Fc,n.ajaxTransport(function(a){var b;return k.cors||Fc&&!a.crossDomain?{send:function(c,d){var e,f=a.xhr(),g=++Cc;if(f.open(a.type,a.url,a.async,a.username,a.password),a.xhrFields)for(e in a.xhrFields)f[e]=a.xhrFields[e];a.mimeType&&f.overrideMimeType&&f.overrideMimeType(a.mimeType),a.crossDomain||c["X-Requested-With"]||(c["X-Requested-With"]="XMLHttpRequest");for(e in c)f.setRequestHeader(e,c[e]);b=function(a){return function(){b&&(delete Dc[g],b=f.onload=f.onerror=null,"abort"===a?f.abort():"error"===a?d(f.status,f.statusText):d(Ec[f.status]||f.status,f.statusText,"string"==typeof f.responseText?{text:f.responseText}:void 0,f.getAllResponseHeaders()))}},f.onload=b(),f.onerror=b("error"),b=Dc[g]=b("abort");try{f.send(a.hasContent&&a.data||null)}catch(h){if(b)throw h}},abort:function(){b&&b()}}:void 0}),n.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/(?:java|ecma)script/},converters:{"text script":function(a){return n.globalEval(a),a}}}),n.ajaxPrefilter("script",function(a){void 
0===a.cache&&(a.cache=!1),a.crossDomain&&(a.type="GET")}),n.ajaxTransport("script",function(a){if(a.crossDomain){var b,c;return{send:function(d,e){b=n(" -Today we hear a lot about personalized medicine. Yet the *personalization* is defined by the genetic make up of the individual. Today we will discuss how this information can be uncovered from the genomic sequencing data. The figure above shows distribution of rare and common variants in 1,092 human genomes described by the [1000 Genome Consortium](http://www.nature.com/nature/journal/v491/n7422/abs/nature11632.html). +Today we hear a lot about personalized medicine. Yet the *personalization* is defined by the genetic make up of the individual. Today we will discuss how this information can be uncovered from the genomic sequencing data. The figure above shows distribution of rare and common variants in 1,092 human genomes described by the [1000 Genome Consortium](https://www.nature.com/nature/journal/v491/n7422/abs/nature11632.html). # Calling variants -Variant calling is a complex field that was significantly propelled by advances in DNA sequencing and efforts of large scientific consortia such as the [1000 Genomes](http://www.1000genomes.org). Here we summarize basic ideas central to Genotype and Variant calling. First, let's contrast the two things although they often go together: +Variant calling is a complex field that was significantly propelled by advances in DNA sequencing and efforts of large scientific consortia such as the [1000 Genomes](https://www.1000genomes.org). Here we summarize basic ideas central to Genotype and Variant calling. First, let's contrast the two things although they often go together: * **Variant calling** - identification of positions where the sequenced sample is different from the reference sequence (or [reference genome graph](https://github.com/vgteam/vg)); * **Genotype calling** - identifying individual's genotype at variable sites. 
@@ -55,7 +55,7 @@ A typical workflow for variation discovery involves the following steps (e.g., s 6. Performing filtering and genotype quality score recalibration 7. Annotating variants and performing downstream analyses -However, continuing evolution of variant detection methods has made some of these steps obsolete. For instance, omitting quality score recalibration and re-alignment (steps 3 and 4 above) when using haplotype-aware variant callers such as [FreeBayes](https://github.com/ekg/freebayes) does not have an effect on the resulting calls (see Brad Chapman's methodological comparisons at [bcbio](http://bit.ly/1S9kFJN)). Before going forward with an actual genotype calling in Galaxy let's take a look as some basic ideas behind modern variant callers. +However, continuing evolution of variant detection methods has made some of these steps obsolete. For instance, omitting quality score recalibration and re-alignment (steps 3 and 4 above) when using haplotype-aware variant callers such as [FreeBayes](https://github.com/ekg/freebayes) does not have an effect on the resulting calls (see Brad Chapman's methodological comparisons at [bcbio](https://bit.ly/1S9kFJN)). Before going forward with an actual genotype calling in Galaxy let's take a look as some basic ideas behind modern variant callers. ## How does SNP calling and genotyping work? @@ -128,7 +128,7 @@ In the simplest case we can estimate these as follows: Suppose $S_i$ is a base in read $i$ corresponding to a genome position with genotype $G$. The probability of seeing $S_i$ given $G$, or $P(S_i|G)$, is given by the quality score of $S_i$ (the quality scores are given by base calling software and reported as [phred scores](https://en.wikipedia.org/wiki/Phred_quality_score)). Thus the genotype likelihood $P(S|G)$ is the product of $P(S_i|G)$ over all $i$. 
In reality however there are many other sources of uncertainty (in addition to base qualities) that are incorporated in the calculation of data likelihoods including NGS technology-related issues, dependency of error rates on substitution type (e.g., transitions versus transversions), sequencing context etc... ### $P(G)$ - a single sample case -One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al. 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. +One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](https://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al. 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. ### $P(G)$ - a multi-sample case Genotype calling reliability can be significantly improved when analyzing multiple samples jointly. In this case genotype frequencies can be inferred from allele frequencies using Hardy-Weinberg equilibrium ([HWE](https://en.wikipedia.org/wiki/Hardy%E2%80%93Weinberg_principle)). The following example (again from [Nielsen et al. 2011](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) illustrates this idea: suppose you are calling genotypes for a single individual using a combination of multiple samples. 
There are two genotypes, **AT** and **AA**, with equally large genotype likelihoods. If, however, in our collection of multiple samples the frequency of **A** is 1% ($p = 0.01$; $q = 1 - p = 0.99$), then from the HWE we have: @@ -149,7 +149,7 @@ This makes it highly unlikely that **AA** is a true genotype of this individual. * **Variant quality recalibration is avoided** by incorporating a number of metrics, such as read placement bias and allele balance, directly into the Bayesian model; * **Ability to incorporate non-diploid cases** such as pooled datasets or data from polyploid samples. -Freebayes is a *haplotype-based* variant caller. This implies that instead of looking at an individual positions within an alignment of reads to the reference genome, it looks at a haplotype window, length of which is dynamically determined (see section 3.2. in [FreeBayes manuscript](http://arxiv.org/pdf/1207.3907v2.pdf)): +Freebayes is a *haplotype-based* variant caller. This implies that instead of looking at an individual positions within an alignment of reads to the reference genome, it looks at a haplotype window, length of which is dynamically determined (see section 3.2. in [FreeBayes manuscript](https://arxiv.org/pdf/1207.3907v2.pdf)): | | @@ -161,13 +161,13 @@ Freebayes is a *haplotype-based* variant caller. This implies that instead of lo ## The data -In this example we will perform variant calling and annotation using [genome in the bottle data](http://jimb.stanford.edu/giab/). Specifically, we will use Ashkenazim Father-Mother-Son trio data from the Personal Genome Project: +In this example we will perform variant calling and annotation using [genome in the bottle data](https://jimb.stanford.edu/giab/). 
Specifically, we will use Ashkenazim Father-Mother-Son trio data from the Personal Genome Project: * HG002 - NA24385 - huAA53E0 (son) * HG003 - NA24149 - hu6E4515 (father) * HG004 - NA24143 - hu8E87A9 (mother) -Yet for a quick tutorial these datasets are way too big, so we created a downsampled (watered down) dataset. This dataset was produced by mapping the trio reads against the `hg19` version of the human genome, merging the resulting bam files together (we use readgroups to label individual reads so they can be traced to each of the original individuals), and restricting alignments to a small portion of chromosome 19 containing the [*POLRMT*](http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=5442) gene. +Yet for a quick tutorial these datasets are way too big, so we created a downsampled (watered down) dataset. This dataset was produced by mapping the trio reads against the `hg19` version of the human genome, merging the resulting bam files together (we use readgroups to label individual reads so they can be traced to each of the original individuals), and restricting alignments to a small portion of chromosome 19 containing the [*POLRMT*](https://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=5442) gene. Here is what to do to load the data: @@ -200,7 +200,7 @@ Here is what to do to load the data: > >![](../../images/freebayes_gq.png) > ->Depending on how busy Galaxy is this may take a little bit of time (coffee break?). Eventially this will produce a dataset in [VCF](http://www.1000genomes.org/wiki/Analysis/variant-call-format) format containing 35 putative variants. Before we can continue we need to post-process this dataset by breaking compound variants into multiple independent variants with **VcfAllelicPrimitives** tool found within **NGS: VCF Manipulation** section. 
This is necessary for ensuring the smooth sailing through downstream analyses: +>Depending on how busy Galaxy is this may take a little bit of time (coffee break?). Eventially this will produce a dataset in [VCF](https://www.1000genomes.org/wiki/Analysis/variant-call-format) format containing 35 putative variants. Before we can continue we need to post-process this dataset by breaking compound variants into multiple independent variants with **VcfAllelicPrimitives** tool found within **NGS: VCF Manipulation** section. This is necessary for ensuring the smooth sailing through downstream analyses: > {: .hands_on} @@ -229,7 +229,7 @@ chr19 618854 . G A 81.7546 ### Annotating variants with SnpEff -At this point we are ready to begin annotating variants using [SnpEff](http://snpeff.sourceforge.net/SnpEff.html). SnpEff, a project maintained by [Pablo Cingolani](https://www.linkedin.com/in/pablocingolani) "*...annotates and predicts the effects of variants on genes (such as amino acid changes)...*" and so is critical for functional interpretation of variation data. +At this point we are ready to begin annotating variants using [SnpEff](https://snpeff.sourceforge.net/SnpEff.html). SnpEff, a project maintained by [Pablo Cingolani](https://www.linkedin.com/in/pablocingolani) "*...annotates and predicts the effects of variants on genes (such as amino acid changes)...*" and so is critical for functional interpretation of variation data. >### Running SNPeff > @@ -249,7 +249,7 @@ At this point we are ready to begin annotating variants using [SnpEff](http://sn ## Manipulating variation data with GEMINI -Now that we have an annotated VCF file it is time to peek inside our variation data. [Aaron Quinlan](http://quinlanlab.org/), creator of [GEMINI](http://gemini.readthedocs.org/en/latest/index.html), calls it *Detective work*. +Now that we have an annotated VCF file it is time to peek inside our variation data. 
[Aaron Quinlan](https://quinlanlab.org/), creator of [GEMINI](http://gemini.readthedocs.org/en/latest/index.html), calls it *Detective work*. ### Loading data into GEMINI @@ -279,9 +279,9 @@ The first step is to convert a VCF file we would like to analyze into a GEMINI d ### Querying GEMINI database -GEMINI database is queried using the versatile SQL language (more on SQL [here](http://swcarpentry.github.io/sql-novice-survey)). In Galaxy's version of GEMINI this is done using **GEMINI_query** tool. Within this tool SQL commands are typed directly into the **The query to be issued to the database** text box. Let's begin getting information from some of the tables we discovered with **GEMINI_db_info** tool above. +GEMINI database is queried using the versatile SQL language (more on SQL [here](https://swcarpentry.github.io/sql-novice-survey)). In Galaxy's version of GEMINI this is done using **GEMINI_query** tool. Within this tool SQL commands are typed directly into the **The query to be issued to the database** text box. Let's begin getting information from some of the tables we discovered with **GEMINI_db_info** tool above. -The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/gemini-tutorials/Intro-To-Gemini.pdf)" tutorial. For extensive documentation see "[Querying GEMINI](http://gemini.readthedocs.org/en/latest/content/querying.html)". +The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/gemini-tutorials/Intro-To-Gemini.pdf)" tutorial. For extensive documentation see "[Querying GEMINI](https://gemini.readthedocs.org/en/latest/content/querying.html)". > ### Are there "novel" varinats that are not annotated in dbSNP database? 
> @@ -314,7 +314,7 @@ The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/ge >SELECT rs_ids, aaf_esp_ea, impact, clinvar_disease_name, clinvar_sig FROM variants WHERE filter is NULL and gene = 'POLRMT' >``` > ->(column definitions can be found [here](http://gemini.readthedocs.org/en/latest/content/database_schema.html)) +>(column definitions can be found [here](https://gemini.readthedocs.org/en/latest/content/database_schema.html)) > >[Output](https://usegalaxy.org/datasets/bbd44e69cb8906b540d65297cd1d26bb/display/?preview=True) shows varinats found within the *POLRMT* gene. > @@ -423,7 +423,7 @@ Let's try a few examples. > > * `(gt_types).(*).(==HET).(all)` > ->the [all operator](http://gemini.readthedocs.org/en/latest/content/querying.html#the-all-operator) implies that want results for **all** afftected individuals). Output will look like [this](https://usegalaxy.org/datasets/bbd44e69cb8906b5819e1404b5e127d1/display/?preview=True). +>the [all operator](https://gemini.readthedocs.org/en/latest/content/querying.html#the-all-operator) implies that want results for **all** afftected individuals). Output will look like [this](https://usegalaxy.org/datasets/bbd44e69cb8906b5819e1404b5e127d1/display/?preview=True). > {: .question} diff --git a/topics/usegalaxy/tutorials/dunovo/tutorial.md b/topics/usegalaxy/tutorials/dunovo/tutorial.md index 96a67063..a78156ba 100644 --- a/topics/usegalaxy/tutorials/dunovo/tutorial.md +++ b/topics/usegalaxy/tutorials/dunovo/tutorial.md @@ -4,16 +4,16 @@ topic_name: usegalaxy tutorial_name: dunovo --- -This page explains how to perform discovery of low frequency variants from duplex sequencing data. As an example we use the _ABL1_ dataset published by [Schmitt and colleagues](https://www.ncbi.nlm.nih.gov/pubmed/25849638) (SRA accession [SRR1799908](http://www.ncbi.nlm.nih.gov/sra/?term=SRR1799908)). +This page explains how to perform discovery of low frequency variants from duplex sequencing data. 
As an example we use the _ABL1_ dataset published by [Schmitt and colleagues](https://www.ncbi.nlm.nih.gov/pubmed/25849638) (SRA accession [SRR1799908](https://www.ncbi.nlm.nih.gov/sra/?term=SRR1799908)). # Background -Calling low frequency variants from next generation sequencing (NGS) data is challenging due to significant amount of noise characteristic of these technologies. [Duplex sequencing](http://www.pnas.org/content/109/36/14508.short) (DS) was designed to address this problem by increasing sequencing accuracy by over four orders of magnitude. DS uses randomly generated barcodes to uniquely tag each molecule in a sample. The tagged fragments are then PCR amplified prior to the preparation of a sequencing library, creating fragment families characterized by unique combination of barcodes at both 5’ and 3’ ends: +Calling low frequency variants from next generation sequencing (NGS) data is challenging due to significant amount of noise characteristic of these technologies. [Duplex sequencing](https://www.pnas.org/content/109/36/14508.short) (DS) was designed to address this problem by increasing sequencing accuracy by over four orders of magnitude. DS uses randomly generated barcodes to uniquely tag each molecule in a sample. The tagged fragments are then PCR amplified prior to the preparation of a sequencing library, creating fragment families characterized by unique combination of barcodes at both 5’ and 3’ ends: ->[![duplex](../../images/ds.png)](http://www.pnas.org/content/109/36/14508/F1.expansion.html) +>[![duplex](../../images/ds.png)](https://www.pnas.org/content/109/36/14508/F1.expansion.html) > ->The logic of duplex sequencing. From [Schmitt:2012](http://www.pnas.org/content/109/36/14508.short). +>The logic of duplex sequencing. From [Schmitt:2012](https://www.pnas.org/content/109/36/14508.short). 
The computational analysis of DS data (Part `C` in the figure above) produces two kinds of output: @@ -32,7 +32,7 @@ In the image above there are two alleles: green (A) and red (G). After PCR a fra The entire analysis described here is accessible as a [Galaxy history](https://usegalaxy.org/u/aun1/h/duplex-analysis-abl1) (by clicking on this link you can create your own copy and play with it). ->![History Item](http://galaxyproject.org/duplex/histItem.png) +>![History Item](https://galaxyproject.org/duplex/histItem.png) > >Each history item has a Rerun ![refresh](https://galaxyproject.org/tutorials/g101/fa-refresh.png) button. Clicking this button will show you how this tool was run with all parameters filled in exactly. @@ -165,7 +165,7 @@ bwa-mem | 130,880,141 | A | G | 0.479 | We can see that results of both mappers agree very well. The reason we see these numbers grouped by mappers is because we have set the readgroups while [mapping](#align-against-genome-with-bwa-and-bwa-mem). -The polymorphism we are interested in (and the one reported by [Schmitt:2015] (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4414912/)) is at the position 130,872,141 and has a frequency of 1.3%. The other site (position 130,880,141) is a known common variant [rs2227985](http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs2227985), which is heterozygous in this sample. +The polymorphism we are interested in (and the one reported by [Schmitt:2015] (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4414912/)) is at the position 130,872,141 and has a frequency of 1.3%. The other site (position 130,880,141) is a known common variant [rs2227985](https://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?type=rs&rs=rs2227985), which is heterozygous in this sample. 
# Analysis of single strand consensus data diff --git a/topics/usegalaxy/tutorials/ngs/tutorial.md b/topics/usegalaxy/tutorials/ngs/tutorial.md index 38b528e7..08ee33d0 100644 --- a/topics/usegalaxy/tutorials/ngs/tutorial.md +++ b/topics/usegalaxy/tutorials/ngs/tutorial.md @@ -42,10 +42,10 @@ Finally, datasets can be uploaded directly from NCBI's short read archive: ### Try it yourself -- Create a new Galaxy history at http://usegalaxy.org (don't forget to log in). +- Create a new Galaxy history at https://usegalaxy.org (don't forget to log in). - Import the following two datasets (for help see the above video): - - [A set of Forward reads](http://www.bx.psu.edu/~anton/share/ng_test_data/var/raw_mother-ds-1.fq.gz) - - [A set of Reverse reads](http://www.bx.psu.edu/~anton/share/ng_test_data/var/raw_mother-ds-2.fq.gz) + - [A set of Forward reads](https://www.bx.psu.edu/~anton/share/ng_test_data/var/raw_mother-ds-1.fq.gz) + - [A set of Reverse reads](https://www.bx.psu.edu/~anton/share/ng_test_data/var/raw_mother-ds-2.fq.gz) These are paired end data (see below for explanation of what paired-end is) for a single Illumina run. Keep Galaxy history for later. We will need it again in a few minutes. @@ -53,7 +53,7 @@ These are paired end data (see below for explanation of what paired-end is) for ## What is Fastq? -[FastQ](http://en.wikipedia.org/wiki/FASTQ_format) is not a very well defined format. In the beginning various manufacturers of sequencing instruments were free to interpret fastq as they saw fit, resulting in a multitude of fastq flavors. This variation stemmed primarily from different ways of encoding quality values as described [here](http://en.wikipedia.org/wiki/FASTQ_format) (below you will explanation of quality scores and their meaning). Today, [fastq Sanger](http://www.ncbi.nlm.nih.gov/pubmed/20015970) version of the format is considered to be the standard form of fastq. 
Galaxy is using fastq sanger as the only legitimate input for downstream processing tools and provides [a number of utilities for converting fastq files](http://www.ncbi.nlm.nih.gov/pubmed/20562416) into this form (see **NGS: QC and manipulation** section of Galaxy tools). +[FastQ](https://en.wikipedia.org/wiki/FASTQ_format) is not a very well defined format. In the beginning various manufacturers of sequencing instruments were free to interpret fastq as they saw fit, resulting in a multitude of fastq flavors. This variation stemmed primarily from different ways of encoding quality values as described [here](http://en.wikipedia.org/wiki/FASTQ_format) (below you will explanation of quality scores and their meaning). Today, [fastq Sanger](http://www.ncbi.nlm.nih.gov/pubmed/20015970) version of the format is considered to be the standard form of fastq. Galaxy is using fastq sanger as the only legitimate input for downstream processing tools and provides [a number of utilities for converting fastq files](http://www.ncbi.nlm.nih.gov/pubmed/20562416) into this form (see **NGS: QC and manipulation** section of Galaxy tools). Fastq format looks like this: @@ -168,7 +168,7 @@ The base qualities allow us to judge how trustworthy each base in a sequencing r
    -Illumina sequencing is based on identifying the individual nucleotides by the fluorescence signal emitted upon their incorporation into the growing sequencing read. Once the fluorescence intensities are extracted and translated into the four letter code. The deduction of nucleotide sequences from the images acquired during sequencing is commonly referred to as base calling. Due to the imperfect nature of the sequencing process and limitations of the optical instruments, base calling will always have inherent uncertainty. This is the reason why FASTQ files store the DNA sequence of each read together with a position-specific quality score that represents the error probability, i.e., how likely it is that an individual base call may be incorrect. The score is called [Phred score](http://www.phrap.com/phred/), $Q$, which is proportional to the probability $p$ that a base call is incorrect, where $Q = −10lg(p)$. For example, a Phred score of 10 corresponds to one error in every ten base calls ($Q = −10lg(0.1)$), or 90% accuracy; a Phred score of 20 corresponds to one error in every 100 base calls, or 99% accuracy. A higher Phred score thus reflects higher confidence in the reported base. To assign each base a unique score identifier (instead of numbers of varying character length), Phred scores are typically represented as ASCII characters. At http://ascii-code.com/ you can see which characters are assigned to what number. For raw reads, the range of scores will depend on the sequencing technology and the base caller used (Illumina, for example, used a tool called Bustard, or, more recently, RTA). Unfortunately, Illumina has been anything but consistent in how they calculated and ASCII-encoded the Phred score (see below)! In addition, Illumina now allows Phred scores for base calls with as high as 45, while 41 used to be the maximum score until the HiSeq X. This may cause issues with downstream sapplications that expect an upper limit of 41. 
+Illumina sequencing is based on identifying the individual nucleotides by the fluorescence signal emitted upon their incorporation into the growing sequencing read. Once the fluorescence intensities are extracted and translated into the four letter code. The deduction of nucleotide sequences from the images acquired during sequencing is commonly referred to as base calling. Due to the imperfect nature of the sequencing process and limitations of the optical instruments, base calling will always have inherent uncertainty. This is the reason why FASTQ files store the DNA sequence of each read together with a position-specific quality score that represents the error probability, i.e., how likely it is that an individual base call may be incorrect. The score is called [Phred score](https://www.phrap.com/phred/), $Q$, which is proportional to the probability $p$ that a base call is incorrect, where $Q = −10lg(p)$. For example, a Phred score of 10 corresponds to one error in every ten base calls ($Q = −10lg(0.1)$), or 90% accuracy; a Phred score of 20 corresponds to one error in every 100 base calls, or 99% accuracy. A higher Phred score thus reflects higher confidence in the reported base. To assign each base a unique score identifier (instead of numbers of varying character length), Phred scores are typically represented as ASCII characters. At http://ascii-code.com/ you can see which characters are assigned to what number. For raw reads, the range of scores will depend on the sequencing technology and the base caller used (Illumina, for example, used a tool called Bustard, or, more recently, RTA). Unfortunately, Illumina has been anything but consistent in how they calculated and ASCII-encoded the Phred score (see below)! In addition, Illumina now allows Phred scores for base calls as high as 45, while 41 used to be the maximum score until the HiSeq X. This may cause issues with downstream applications that expect an upper limit of 41. 
![](../../images/illumina_qs.png) @@ -185,14 +185,14 @@ Sanger/Phred format that is also used by other sequencing platforms and the sequ ## Assessing data quality -One of the first steps in the analysis of NGS data is seeing how good the data actually is. [FastqQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is a fantastic tool allowing you to gauge the quality of fastq datasets (and deciding whether to blame or not to blame whoever has done sequencing for you). +One of the first steps in the analysis of NGS data is seeing how good the data actually is. [FastqQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) is a fantastic tool allowing you to gauge the quality of fastq datasets (and deciding whether to blame or not to blame whoever has done sequencing for you). | | | |:---------------------------------------|:-----------------------------------| | ![](../../images/good_fq.png) | ![](../../images/bad_fq.png) | |**A.** Excellent quality | **B.** Hmmm...OK | -Here you can see FastQC base quality reports (the tools gives you many other types of data) for two datasets: **A** and **B**. The **A** dataset has long reads (250 bp) and very good quality profile with no qualities dropping below [phred score](http://www.phrap.com/phred/) of 30. The **B** dataset is significantly worse with ends of the reads dipping below phred score of 20. The **B** reads may need to be trimmed for further processing. +Here you can see FastQC base quality reports (the tools gives you many other types of data) for two datasets: **A** and **B**. The **A** dataset has long reads (250 bp) and very good quality profile with no qualities dropping below [phred score](https://www.phrap.com/phred/) of 30. The **B** dataset is significantly worse with ends of the reads dipping below phred score of 20. The **B** reads may need to be trimmed for further processing.
    @@ -204,11 +204,11 @@ QC datasets you have uploaded before. Mapping of NGS reads against reference sequences is one of the key steps of the analysis. Now it is time to see how this is done in practice. Below is a list of key publications highlighting mainstream mapping tools: -- 2009 Bowtie 1 - [Langmead et al.](http://genomebiology.com/content/10/3/R25) +- 2009 Bowtie 1 - [Langmead et al.](https://genomebiology.com/content/10/3/R25) - 2012 Bowtie 2 - [Langmead and Salzberg](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3322381/) -- 2009 BWA - [Li and Durbin](http://bioinformatics.oxfordjournals.org/content/25/14/1754.long) -- 2010 BWA - [Li and Durbin](http://bioinformatics.oxfordjournals.org/content/26/5/589) -- 2013 BWA-MEM - [Li](http://arxiv.org/abs/1303.3997) +- 2009 BWA - [Li and Durbin](https://bioinformatics.oxfordjournals.org/content/25/14/1754.long) +- 2010 BWA - [Li and Durbin](https://bioinformatics.oxfordjournals.org/content/26/5/589) +- 2013 BWA-MEM - [Li](https://arxiv.org/abs/1303.3997) ## Mapping against a pre-computed genome index @@ -369,7 +369,7 @@ Thus, for example, we can use the NM:i:0 tag to select only those reads which ma One of the key features of SAM/BAM format is the ability to label individual reads with readgroup tags. This allows pooling results of multiple experiments into a single BAM dataset. This significantly simplifies downstream logistics: instead of dealing with multiple datasets one can handle just one. Many downstream analysis tools such as variant callers are designed to recognize readgroup data and output results on per-readgroup basis. -One of the best descriptions of BAM readgroups is on [GATK support site](http://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files). 
We have gratefully stolen two tables describing the most important readgroup tags - `ID`, `SM`, `LB`, and `PL` - from GATK forum and provide them here: +One of the best descriptions of BAM readgroups is on [GATK support site](https://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files). We have gratefully stolen two tables describing the most important readgroup tags - `ID`, `SM`, `LB`, and `PL` - from GATK forum and provide them here: ![](../../images/rg.png) @@ -386,9 +386,9 @@ To see an example of read group manipulation in Galaxy see the following video: We support four major toolsets for processing of SAM/BAM datasets: * [DeepTools](https://deeptools.github.io/) - a suite of user-friendly tools for the visualization, quality control and normalization of data from deep-sequencing DNA sequencing experiments. - * [SAMtools](http://www.htslib.org/) - various utilities for manipulating alignments in the SAM/BAM format, including sorting, merging, indexing and generating alignments in a per-position format. + * [SAMtools](https://www.htslib.org/) - various utilities for manipulating alignments in the SAM/BAM format, including sorting, merging, indexing and generating alignments in a per-position format. * [BAMtools](https://github.com/pezmaster31/bamtools/wiki/Tutorial_Toolkit_BamTools-1.0.pdf) - a toolkit for reading, writing, and manipulating BAM (genome alignment) files. - * [Picard](http://broadinstitute.github.io/picard/) - a set of Java tools for manipulating high-throughput sequencing data (HTS) data and formats. + * [Picard](https://broadinstitute.github.io/picard/) - a set of Java tools for manipulating high-throughput sequencing data (HTS) data and formats. The following video highlights de-duplication, filtering, and cleaning of a BAM dataset using BAMtools and Picard tools: @@ -402,12 +402,12 @@ Perform a similar analyses with your own data. 
## PCR duplicates -Preparation of sequencing libraries (at least at the time of writing) for technologies such as Illumina (used in this examples) involves PCR amplification. It is required to generate sufficient number of sequencing templates so that a reliable detection can be performed by base callers. Yet PCR has it's biases, which are especially profound in cases of multitemplate PCR used for construction of sequencing libraries (Kanagawa et al. [2003](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=16233530)). +Preparation of sequencing libraries (at least at the time of writing) for technologies such as Illumina (used in this examples) involves PCR amplification. It is required to generate sufficient number of sequencing templates so that a reliable detection can be performed by base callers. Yet PCR has it's biases, which are especially profound in cases of multitemplate PCR used for construction of sequencing libraries (Kanagawa et al. [2003](https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=16233530)). | | |----------------------------------------------| | ![](../../images/pcr-duplicates.png) | -|Analyzing molecules aligning with the same outer coordinates, a mapping quality of at least 30 and a length of at least 30nt, resulted in an average coverage of 12.9 per PCR duplicate and an empirical coverage distribution similar to an exponential/power law distribution (left upper panel). This indicates that many molecules are only observed for deeper sequencing while other molecules are available at higher frequencies. Analyzing length (left middle panel) and GC content (left lower panel) patterns as well as the combination (right panel) shows higher PCR duplicate counts for a GC content between 30% to 70% as well as for shorter molecules compared to longer molecules. 
This effect may be due to an amplification bias from the polymerase or the cluster generation process necessary for Illumina sequencing. From Ph.D. dissertation of [Martin Kircher](http://www.qucosa.de/fileadmin/data/qucosa/documents/7110/pflichtexemplar_final.pdf)).| +|Analyzing molecules aligning with the same outer coordinates, a mapping quality of at least 30 and a length of at least 30nt, resulted in an average coverage of 12.9 per PCR duplicate and an empirical coverage distribution similar to an exponential/power law distribution (left upper panel). This indicates that many molecules are only observed for deeper sequencing while other molecules are available at higher frequencies. Analyzing length (left middle panel) and GC content (left lower panel) patterns as well as the combination (right panel) shows higher PCR duplicate counts for a GC content between 30% to 70% as well as for shorter molecules compared to longer molecules. This effect may be due to an amplification bias from the polymerase or the cluster generation process necessary for Illumina sequencing. From Ph.D. dissertation of [Martin Kircher](https://www.qucosa.de/fileadmin/data/qucosa/documents/7110/pflichtexemplar_final.pdf)).| Duplicates can be identified based on their outer alignment coordinates or using sequence-based clustering. One of the common ways for identification of duplicate reads is the `MarkDuplicates` utility from [Picard](https://broadinstitute.github.io/picard/command-line-overview.html) package. It is designed to identify both PCR and optical duplicates: @@ -424,4 +424,4 @@ However, one has to be careful when removing duplicates in cases when the sequen | | |----------------------------------------------| | ![](../../images/sampling-bias.png) | -| The Variant Allele Frequency (VAF) bias determined by coverage and insert size variance. Reads are paired-end and read length is 76. 
The insert size distribution is modeled as a Gaussian distribution with mean at 200 and standard deviation shown on the x-axis. The true VAF is 0.05. The darkness at each position indicates the magnitude of the bias in the VAF. (From Zhou et al. [2013](http://bioinformatics.oxfordjournals.org/content/30/8/1073)). | +| The Variant Allele Frequency (VAF) bias determined by coverage and insert size variance. Reads are paired-end and read length is 76. The insert size distribution is modeled as a Gaussian distribution with mean at 200 and standard deviation shown on the x-axis. The true VAF is 0.05. The darkness at each position indicates the magnitude of the bias in the VAF. (From Zhou et al. [2013](https://bioinformatics.oxfordjournals.org/content/30/8/1073)). | diff --git a/topics/usegalaxy/tutorials/non-dip/tutorial.md b/topics/usegalaxy/tutorials/non-dip/tutorial.md index a6ff958a..b97b6592 100644 --- a/topics/usegalaxy/tutorials/non-dip/tutorial.md +++ b/topics/usegalaxy/tutorials/non-dip/tutorial.md @@ -9,8 +9,8 @@ tutorial_name: non-dip The majority of life on Earth is non-diploid and represented by prokaryotes, viruses and their derivatives such as our own mitochondria or plant's chloroplasts. In non-diploid systems allele frequencies can range anywhere between 0 and 100% and there could be multiple (not just two) alleles per locus. The main challenge associated with non-diploid variant calling is the difficulty in distinguishing between sequencing noise (abundant in all NGS platforms) and true low frequency variants. 
Some of the early attempts to do this well have been accomplished on human mitochondrial DNA although the same approaches will work equally good on viral and bacterial genomes: -* 2014 - [Maternal age effect and severe germ-line bottleneck in the inheritance of human mitochondrial DNA](http://www.pnas.org/content/111/43/15474.abstract) -* 2015 - [Extensive tissue-related and allele-related mtDNA heteroplasmy suggests positive selection for somatic mutations](http://www.pnas.org/content/112/8/2491.abstract). +* 2014 - [Maternal age effect and severe germ-line bottleneck in the inheritance of human mitochondrial DNA](https://www.pnas.org/content/111/43/15474.abstract) +* 2015 - [Extensive tissue-related and allele-related mtDNA heteroplasmy suggests positive selection for somatic mutations](https://www.pnas.org/content/112/8/2491.abstract). As an example of non-diploid system we will be using human mitochondrial genome as an example. However, this approach will also work for most bacterial and viral genomes as well. @@ -26,13 +26,13 @@ There are two ways one can call variants: In this tutorials we will take the *first* path is which we map reads against an existing assembly. Later in the course (after we learn about assembly approaches) we will try the second approach as well. -The goal of this example is to detect heteroplasmies (variants within mitochondrial DNA). Mitochondria is transmitted maternally and heteroplasmy frequencies may change dramatically and unpredictably during the transmission, due to a germ-line bottleneck [Cree:2008](http://www.nature.com/ng/journal/v40/n2/abs/ng.2007.63.html). As we mentioned above the procedure for finding variants in bacterial or viral genomes will be essentially the same. +The goal of this example is to detect heteroplasmies (variants within mitochondrial DNA). 
Mitochondria is transmitted maternally and heteroplasmy frequencies may change dramatically and unpredictably during the transmission, due to a germ-line bottleneck [Cree:2008](https://www.nature.com/ng/journal/v40/n2/abs/ng.2007.63.html). As we mentioned above the procedure for finding variants in bacterial or viral genomes will be essentially the same. [A Galaxy Library](https://usegalaxy.org/library/list#folders/Fe4842bd0c37b03a7) contains datasets representing a child and a mother. These datasets are obtained by paired-end Illumina sequencing of human genomic DNA enriched for mitochondria. The enrichment was performed using long-range PCR with two primer pairs that amplify the entire mitochondrial genome. This means that these samples still contain a lot of DNA from the nuclear genome, which, in this case, is a contaminant. # Importing example datasets -For this tutorial we have prepared a subset of data previously [published](http://www.pnas.org/content/111/43/15474.abstract) by our group. Let's import these data into Galaxy. +For this tutorial we have prepared a subset of data previously [published](https://www.pnas.org/content/111/43/15474.abstract) by our group. Let's import these data into Galaxy. > ### Data upload from a Galaxy Library > @@ -59,7 +59,7 @@ Before proceeding with the analysis, we need to find out how good the data actua > > ![](../../images/mt_qc.png) > ->QC'ing reads using [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/). Note that we selected all four datasets at once by pressing the middle button ![](../../images/mt_middle_button.png) adjacent to the **Short read data from your current history** widget. Once `FastQC` job runs, you will be able to look at the HTML reports generated by this tool. +>QC'ing reads using [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/). 
Note that we selected all four datasets at once by pressing the middle button ![](../../images/mt_middle_button.png) adjacent to the **Short read data from your current history** widget. Once `FastQC` job runs, you will be able to look at the HTML reports generated by this tool. > >The data have generally high quality in this example: > @@ -99,7 +99,7 @@ We can BAM dataset using **NGS: Picard** → **MergeSAMFiles** tool: ## Removing duplicates -Preparation of sequencing libraries (at least at the time of writing) for technologies such as Illumina (used in this example) involves PCR amplification. It is required to generate sufficient number of sequencing templates so that a reliable detection can be performed by base callers. Yet PCR has it's biases, which are especially profound in cases of multitemplate PCR used for construction of sequencing libraries (Kanagawa et al. [2003](http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=16233530)). +Preparation of sequencing libraries (at least at the time of writing) for technologies such as Illumina (used in this example) involves PCR amplification. It is required to generate sufficient number of sequencing templates so that a reliable detection can be performed by base callers. Yet PCR has it's biases, which are especially profound in cases of multitemplate PCR used for construction of sequencing libraries (Kanagawa et al. [2003](https://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=16233530)). Duplicates can be identified based on their outer alignment coordinates or using sequence-based clustering. One of the common ways for identification of duplicate reads is the `MarkDuplicates` utility from [Picard](https://broadinstitute.github.io/picard/command-line-overview.html) package. 
It is designed to identify both PCR and optical duplicates (the following is an excerpt from Picard documentation): @@ -137,7 +137,7 @@ In other words the two datasets had ~6% and ~9% duplicates, respectively. # Left-aligning indels -Left aligning of indels (a variant of re-aligning) is extremely important for obtaining accurate variant calls. This concept, while not difficult, requires some explanation. For illustrating how left-aligning works we expanded on an example provided by [Tan:2015](http://bioinformatics.oxfordjournals.org/content/31/13/2202.abstract). Suppose you have a reference sequence and a sequencing read: +Left aligning of indels (a variant of re-aligning) is extremely important for obtaining accurate variant calls. This concept, while not difficult, requires some explanation. For illustrating how left-aligning works we expanded on an example provided by [Tan:2015](https://bioinformatics.oxfordjournals.org/content/31/13/2202.abstract). Suppose you have a reference sequence and a sequencing read: ``` @@ -160,7 +160,7 @@ GGGCACACACAGGG Ref: GCA GGG--CACACAGGG Alt: G ``` -The last of these is *left-aligned*. In this case gaps (dashes) as moved as far left as possible (for a formal definition of left-alignment and variant normalization see [Tan:2015](http://bioinformatics.oxfordjournals.org/content/31/13/2202.abstract)). +The last of these is *left-aligned*. In this case gaps (dashes) as moved as far left as possible (for a formal definition of left-alignment and variant normalization see [Tan:2015](https://bioinformatics.oxfordjournals.org/content/31/13/2202.abstract)). Let's perform left alignment using **NGS: Variant Analysis** → **BamLeftAlign**: @@ -363,7 +363,7 @@ chrM 8557 . G C 2590.97 . AB=0.267066;ABP=790.051;AC=2;AF=0.5;AN=4;AO=446;CIGAR= ``` # Looking at the data -For visalizaning VCFs Galaxy relies on the two external tools. 
The first is called [VCF.IOBIO](http://vcf.iobio.io/) and is developed by [Gabor Marth's group](http://marthlab.org/) at the University of Utah. The second is called [IGV](http://software.broadinstitute.org/software/igv/) developed by Broad Institute. +For visualizing VCFs Galaxy relies on the two external tools. The first is called [VCF.IOBIO](https://vcf.iobio.io/) and is developed by [Gabor Marth's group](http://marthlab.org/) at the University of Utah. The second is called [IGV](http://software.broadinstitute.org/software/igv/) developed by Broad Institute. ## VCF.IOBIO @@ -486,14 +486,14 @@ Time to really do it yourself. Please, complete the following exercise: > >Suppose you obtained a virus from some source and you would like to see how it is different from its published reference sequence. You have sequenced the virus and obtained two Illumina files (these files are large, so don't open them. Rather copy their addresses (right click) and use them to upload into Galaxy as explained in *Hints* section below): > ->- [Forward reads](http://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/f.fq.gz) ->- [Reverse reads](http://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/r.fq.gz) +>- [Forward reads](https://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/f.fq.gz) +>- [Reverse reads](https://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/r.fq.gz) > ->Analyze these files using Galaxy as was explained in this lesson by mapping them against [this reference genome](http://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/phix.fa) (again right click to copy the address); see *Tips*). +>Analyze these files using Galaxy as was explained in this lesson by mapping them against [this reference genome](https://www.bx.psu.edu/~anton/share/ng_test_data/bmmb554/hw4/phix.fa) (again right click to copy the address); see *Tips*). 
> > > ### :bulb: Tips > > -> > - You need to upload reads and the reference genome into Galaxy (http://usegalaxy.org) as shown in [this video](https://vimeo.com/120973708) +> > - You need to upload reads and the reference genome into Galaxy (https://usegalaxy.org) as shown in [this video](https://vimeo.com/120973708) > > - You will be mapping reads against an uploaded reference genome as shown in [this video](https://vimeo.com/123108417). > {: .tip} {: .comment} diff --git a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md index f4a4fb27..582c93e7 100644 --- a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md +++ b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md @@ -63,10 +63,10 @@ During a typical RNAseq experiment the information about strandedness is lost af Depending on the approach and whether one performs single- or paired-end sequencing there are multiple possibilities on how to interpret the results of mapping of these reads onto genome/transcriptome: ->[![](../../images/lib_type.png)](http://sailfish.readthedocs.org/en/master/library_type.html) +>[![](../../images/lib_type.png)](https://sailfish.readthedocs.org/en/master/library_type.html) > >**Effects of RNAseq library types**
    ->Image and description below is from [Sailfish documentation](http://sailfish.readthedocs.org/en/master/library_type.html) +>Image and description below is from [Sailfish documentation](https://sailfish.readthedocs.org/en/master/library_type.html) The relative orientation of the reads is only relevant if the library is pair-ended. The possible options are: @@ -90,7 +90,7 @@ So by combining the relative orientation of reads is I, O, or M (if reads are pa However, in practice, if you use Illumina paired-end RNAseq protocols you are unlikely to uncover many of these possibilities. You will either deal with: * unstranded RNAseq data (**IU** type from above. Also called **fr-unstranded** in TopHat/Cufflinks jargon); - * stranded RNAseq data produced with Illumina TrueSeq RNAseq kits and [dUTP tagging](http://nar.oxfordjournals.org/content/37/18/e123) (**ISR** type from above or **fr-firststrand** in TopHat/Cufflinks nomenclature). + * stranded RNAseq data produced with Illumina TrueSeq RNAseq kits and [dUTP tagging](https://nar.oxfordjournals.org/content/37/18/e123) (**ISR** type from above or **fr-firststrand** in TopHat/Cufflinks nomenclature). The implication of stranded RNAseq is that you can distinguish whether the reads are derived from forward- or reverse-encoded transcripts: @@ -108,7 +108,7 @@ An RNAseq experiment without a sufficient number of replicates will be a waste o >* **Biological replicates**. There is an on-going debate over what kinds of samples represent true biological replicates. Obviously, the variability between different samples will be greater between RNA extracted from two unrelated humans than between RNA extracted from two different batches of the same cell line. In the latter case, most of the variation that will eventually be detected was probably introduced by the experimenter (e.g., slightly differing media and plating conditions). 
Nevertheless, this is variation the researcher is typically not interested in assessing, therefore the ENCODE consortium defines biological replicates as RNA from an independent growth of cells/tissue (ENCODE [2011](https://genome.ucsc.edu/ENCODE/protocols/dataStandards/ENCODE_RNAseq_Standards_V1.0.pdf)). ->The number of replicates should be as high as practically possible. Most RNAseq experiments include three replicates and some have as many as 12 (see Schurch et al. [2015](http://arxiv.org/abs/1505.02017)). +>The number of replicates should be as high as practically possible. Most RNAseq experiments include three replicates and some have as many as 12 (see Schurch et al. [2015](https://arxiv.org/abs/1505.02017)). ## Read mapping @@ -121,12 +121,12 @@ After sequencing is performed you have a collection of sequencing reads for each ### TopHat, TopHat2, and HiSat -[Tophat](http://bioinformatics.oxfordjournals.org/content/25/9/1105.abstract) was one of the first tools designed specifically to address this problem by identifying potential exons using reads that do map to the genome, generating possible splices between neighboring exons, and comparing reads that did not initially map to the genome agaisnt these *in silico* created junctions: +[Tophat](https://bioinformatics.oxfordjournals.org/content/25/9/1105.abstract) was one of the first tools designed specifically to address this problem by identifying potential exons using reads that do map to the genome, generating possible splices between neighboring exons, and comparing reads that did not initially map to the genome agaisnt these *in silico* created junctions: ->[![](../../images/tophat.png)](http://bioinformatics.oxfordjournals.org/content/25/9/1105/F1.expansion.html) +>[![](../../images/tophat.png)](https://bioinformatics.oxfordjournals.org/content/25/9/1105/F1.expansion.html) > >**TopHat and TopHat2: Mapping RNAseq regions to genome**
    ->In TopHat reads are mapped against the genome and are separated into two categories: (1) those that map, and (2) those that initially unmapped (IUM). "Piles" of reads representing potential exons are extended in search of potential donor/acceptor splice sites and potential splice junctions are reconstructed. IUMs are then mapped to these junctions. Image from [Trapnell:2009](http://bioinformatics.oxfordjournals.org/content/25/9/1105.full). +>In TopHat reads are mapped against the genome and are separated into two categories: (1) those that map, and (2) those that initially unmapped (IUM). "Piles" of reads representing potential exons are extended in search of potential donor/acceptor splice sites and potential splice junctions are reconstructed. IUMs are then mapped to these junctions. Image from [Trapnell:2009](https://bioinformatics.oxfordjournals.org/content/25/9/1105.full). >[![](../../images/tophat2.png)](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) @@ -134,7 +134,7 @@ After sequencing is performed you have a collection of sequencing reads for each >**TopHat has been subsequently improved with the development of TopHat2**
    >Image from [Kim:2012](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2013-14-4-r36) summarizes steps involved in aligning of RNAseq reads with TopHat2 -To further optimize and speed up spliced read alignment Kim at al. [2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) developed [HISAT](http://ccb.jhu.edu/software/hisat2/index.shtml). It uses a set of [FM-indices](https://en.wikipedia.org/wiki/FM-index) consisting one global genome-wide index and a collection of ~48,000 local overlapping 42 kb indices (~55,000 56 kb indices in HiSat2). This allows to find initial seed locations for potential read alignments in the genome using global index and to rapidly refine these alignments using a corresponding local index: +To further optimize and speed up spliced read alignment Kim at al. [2015](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4655817/) developed [HISAT](https://ccb.jhu.edu/software/hisat2/index.shtml). It uses a set of [FM-indices](https://en.wikipedia.org/wiki/FM-index) consisting one global genome-wide index and a collection of ~48,000 local overlapping 42 kb indices (~55,000 56 kb indices in HiSat2). This allows to find initial seed locations for potential read alignments in the genome using global index and to rapidly refine these alignments using a corresponding local index: >![](../../images/hisat.png) @@ -144,25 +144,25 @@ To further optimize and speed up spliced read alignment Kim at al. [2015](https: ### STAR mapper -[STAR aligner](https://github.com/alexdobin/STAR) is a fast alternative for mapping RNAseq reads against genome utilizing uncompressed [suffix array](https://en.wikipedia.org/wiki/Suffix_array). It operates in [two stages](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract). 
In the first stage it performs seed search: +[STAR aligner](https://github.com/alexdobin/STAR) is a fast alternative for mapping RNAseq reads against genome utilizing uncompressed [suffix array](https://en.wikipedia.org/wiki/Suffix_array). It operates in [two stages](https://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract). In the first stage it performs seed search: >![](../../images/star.png) > >**STAR's seed search**
    ->Here a read is split between two consecutive exons. STAR starts to look for a *maximum mappable prefix* (MMP) from the beginning of the read until it can no longer match continuously. After this point it start to MMP for the unmatched portion of the read (**a**). In the case of mismatches (**b**) and unalignable regions (**c**) MMPs serve as anchors from which to extend alignments. Image from [Dobin:2013](http://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html). +>Here a read is split between two consecutive exons. STAR starts to look for a *maximum mappable prefix* (MMP) from the beginning of the read until it can no longer match continuously. After this point it starts to look for the MMP for the unmatched portion of the read (**a**). In the case of mismatches (**b**) and unalignable regions (**c**) MMPs serve as anchors from which to extend alignments. Image from [Dobin:2013](https://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.full.pdf+html). At the second stage STAR stitches MMPs to generate read-level alignments that (contrary to MMPs) can contain mismatches and indels. A scoring scheme is used to evaluate and prioritize stitching combinations and to evaluate reads that map to multiple locations. STAR is extremely fast but requires a substantial amount of RAM to run efficiently. ## Transcript reconstruction -The previous step - mapping - assigns RNAseq reads to genomic locations and identifies splice junctions from reads that originate from different exons. At transcript reconstruction step this information is taken further in attempt to build transcript models. There is a number of tools for performing this task.
A benchmarking paper by [Hayer:2015](http://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html) attempted to compare performance of existing approaches with one of the outcomes shown below: +The previous step - mapping - assigns RNAseq reads to genomic locations and identifies splice junctions from reads that originate from different exons. At the transcript reconstruction step this information is taken further in an attempt to build transcript models. There are a number of tools for performing this task. A benchmarking paper by [Hayer:2015](https://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html) attempted to compare performance of existing approaches with one of the outcomes shown below: ->[![](../../images/rnaseq_comparison.png)](http://bioinformatics.oxfordjournals.org/content/early/2015/09/08/bioinformatics.btv488/F5.large.jpg) +>[![](../../images/rnaseq_comparison.png)](https://bioinformatics.oxfordjournals.org/content/early/2015/09/08/bioinformatics.btv488/F5.large.jpg) > >**Comparison of transcript reconsruction approaches**
    ->Here *recall* (the number of correctly constructed forms divided by the total number of real forms) versus *precision* (true positives divided by the sum of true positives and false positives) are plotted for seven transcript assemblers tested on two simulated datasets: *EnsemblPerfect* and *EnsemblRealistic*. The shaded region is indicating suboptimal performance (i.e., the white, unshaded region is "good"). The figure is from [Hayer:2015](http://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html). +>Here *recall* (the number of correctly constructed forms divided by the total number of real forms) versus *precision* (true positives divided by the sum of true positives and false positives) are plotted for seven transcript assemblers tested on two simulated datasets: *EnsemblPerfect* and *EnsemblRealistic*. The shaded region is indicating suboptimal performance (i.e., the white, unshaded region is "good"). The figure is from [Hayer:2015](https://bioinformatics.oxfordjournals.org/content/early/2015/09/03/bioinformatics.btv488.full.pdf+html). -Based on these results [Cufflinks](http://cole-trapnell-lab.github.io/cufflinks/) and [StringTie](https://ccb.jhu.edu/software/stringtie/) have satisfactory performence. The following discussion is based on inner workings of StringTie. +Based on these results [Cufflinks](https://cole-trapnell-lab.github.io/cufflinks/) and [StringTie](https://ccb.jhu.edu/software/stringtie/) have satisfactory performance. The following discussion is based on the inner workings of StringTie. ### Transcriptome assembly with StringTie @@ -181,38 +181,38 @@ Transcriptome quantification attempts to estimate expression levels of individua ### Assigning reads to transcripts -To associate reads with transcripts they (the reads) need to be aligned to the transcriptome.
Tools like Cufflinks and StringTie reconstruct transcripts from spliced read alignments generated by other programs (TopHat, HISAT, STAR), so they already have the information about which reads belong to each reconstructed transcript. Other tools such as [Sailfish](http://www.cs.cmu.edu/~ckingsf/software/sailfish/), [Kallisto](http://pachterlab.github.io/kallisto/), and [Salmon](http://combine-lab.github.io/salmon/) perform *lightweight* alignment of RNAseq reads against existing transcriptome sequences. The goal of lightweight alignment is to quickly distribute the reads across transcripts they likely originate from without worrying too much about producing high quality alignments. The upside of this is that the entire procedure can be performed very quickly. The downside is that these tools require high quality transcriptome as input, which is not a problem if you work with humans or mice, but is a problem if you are studying Hyacinth macaw or any other brilliantly colored creatures. +To associate reads with transcripts they (the reads) need to be aligned to the transcriptome. Tools like Cufflinks and StringTie reconstruct transcripts from spliced read alignments generated by other programs (TopHat, HISAT, STAR), so they already have the information about which reads belong to each reconstructed transcript. Other tools such as [Sailfish](https://www.cs.cmu.edu/~ckingsf/software/sailfish/), [Kallisto](http://pachterlab.github.io/kallisto/), and [Salmon](http://combine-lab.github.io/salmon/) perform *lightweight* alignment of RNAseq reads against existing transcriptome sequences. The goal of lightweight alignment is to quickly distribute the reads across transcripts they likely originate from without worrying too much about producing high quality alignments. The upside of this is that the entire procedure can be performed very quickly. 
The downside is that these tools require high quality transcriptome as input, which is not a problem if you work with humans or mice, but is a problem if you are studying Hyacinth macaw or any other brilliantly colored creatures. #### Lightweight alignment -[Sailfish](http://www.cs.cmu.edu/~ckingsf/software/sailfish/) has been initially designed to utilize [*k*-mer](https://en.wikipedia.org/wiki/K-mer) matching for finding association between reads and corresponding transcripts: +[Sailfish](https://www.cs.cmu.edu/~ckingsf/software/sailfish/) has been initially designed to utilize [*k*-mer](https://en.wikipedia.org/wiki/K-mer) matching for finding association between reads and corresponding transcripts: >![](../../images/sailfish.png) > >**Assigning reads to transcripts: Sailfish**
    >Sailfish indexes input transcriptome for a fixed *k*-mer length and compares *k*-mers derived from RNAseq reads against this index. Image from [Patro:2014](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4077321/) -The current version of Sailfish uses [quasi-alignment](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) to extend exact matches found with *k*-mers: +The current version of Sailfish uses [quasi-alignment](https://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) to extend exact matches found with *k*-mers: >![](../../images/quasi_aln.png) > >**Quasi-alignment of reads in Sailfish**
    ->In Sailfish version [0.7.0](https://github.com/kingsfordgroup/sailfish/releases/tag/v0.7.0) and up transcriptome is concatenated into a single sequence using `$` separators from which a [suffix array](https://en.wikipedia.org/wiki/Suffix_array) and a [hash table](https://en.wikipedia.org/wiki/Hash_table) are constructed. A *k*-mer from an RNAseq read (green) is looked up in the hash table, which immediately gives its position in the suffix array allowing to extend the march as described in the legend and the [paper](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf). Image from [Srivastava:2015](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) +>In Sailfish version [0.7.0](https://github.com/kingsfordgroup/sailfish/releases/tag/v0.7.0) and up transcriptome is concatenated into a single sequence using `$` separators from which a [suffix array](https://en.wikipedia.org/wiki/Suffix_array) and a [hash table](https://en.wikipedia.org/wiki/Hash_table) are constructed. A *k*-mer from an RNAseq read (green) is looked up in the hash table, which immediately gives its position in the suffix array allowing to extend the match as described in the legend and the [paper](https://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf). Image from [Srivastava:2015](http://biorxiv.org/content/biorxiv/early/2015/10/22/029652.full.pdf) -[Kallisto](http://pachterlab.github.io/kallisto/) also utilizes *k*-mer matching but uses a different data structure. It constructs a [De Bruijn graph](https://en.wikipedia.org/wiki/De_Bruijn_graph) from transcriptome input (pane **b** of the figure below). This graph is different from De Bruijn graphs used for genome assembly in that its nodes are *k*-mers and transcripts correspond to paths through the graph.
To accommodate multiple transcripts that can lay along the same path (or sub-path) the paths are "colored" with each transcript given a distinct "color" (in genome assembly the graph is built from the reads and nodes usually correspond to overlaps between *k*-mers forming incoming and outgoing edges). Non-branching sections of the graph that have identical coloring are "glued" into contigs. Finally a [hash table](https://en.wikipedia.org/wiki/Hash_table) is built that stores the position of each transcriptome *k*-mer within the graph: +[Kallisto](https://pachterlab.github.io/kallisto/) also utilizes *k*-mer matching but uses a different data structure. It constructs a [De Bruijn graph](https://en.wikipedia.org/wiki/De_Bruijn_graph) from transcriptome input (pane **b** of the figure below). This graph is different from De Bruijn graphs used for genome assembly in that its nodes are *k*-mers and transcripts correspond to paths through the graph. To accommodate multiple transcripts that can lay along the same path (or sub-path) the paths are "colored" with each transcript given a distinct "color" (in genome assembly the graph is built from the reads and nodes usually correspond to overlaps between *k*-mers forming incoming and outgoing edges). Non-branching sections of the graph that have identical coloring are "glued" into contigs. Finally a [hash table](https://en.wikipedia.org/wiki/Hash_table) is built that stores the position of each transcriptome *k*-mer within the graph: >![](../../images/kallisto.png) > >**Assigning reads to transcripts: Kallisto**
    ->Here a black read is being associated with a set consisting of red, blue, and green transcripts (**a**). First, a graph is built from transcriptome (**b**). Next, by finding common *k*-mers between the read and the graph the read is "threaded" along a path (**c** and **d**). The colors along that path would indicate which transcripts it is likely derived from. Specifically, this is done by taking intersection of "colors" (**c**). It this case the read is assigned to two transcripts: red and blue. Image from [Bray:2015](http://arxiv.org/pdf/1505.02710v2.pdf) +>Here a black read is being associated with a set consisting of red, blue, and green transcripts (**a**). First, a graph is built from transcriptome (**b**). Next, by finding common *k*-mers between the read and the graph the read is "threaded" along a path (**c** and **d**). The colors along that path would indicate which transcripts it is likely derived from. Specifically, this is done by taking the intersection of "colors" (**c**). In this case the read is assigned to two transcripts: red and blue. Image from [Bray:2015](https://arxiv.org/pdf/1505.02710v2.pdf) [Salmon](https://combine-lab.github.io/salmon/about/) does not use *k*-mer matching approach. Instead it creates [bwa](https://github.com/lh3/bwa)-like [FM-index](https://en.wikipedia.org/wiki/FM-index) and uses it to finds chains of *Maximal Exact Matches* (MEMs) and *Super Maximal Exact Matches* (SMEMs) between a read and the transcriptome.
In the case of Sailfish and Kallisto an index is dependent on *k*-mer length and has to be recomputed every time the *k* is changed. The overall schematics of Salmon operation is as follows: +[Patro:2015](https://biorxiv.org/content/biorxiv/early/2015/06/27/021592.full.pdf) define a MEM as "*a substring that is shared by the query (read) and reference (transcript) that cannot be extended in either direction without introducing a mismatch*". Similarly, a SMEM is defined as a "*MEM that is not contained within any other MEM on the query.*" One of the advantages of utilizing the FM-index is that a new index does not need to be re-generated for a search with a different set of parameters. In the case of Sailfish and Kallisto an index is dependent on *k*-mer length and has to be recomputed every time the *k* is changed. The overall schematics of Salmon operation is as follows: >![](../../images/salmon.png) > >**Assigning reads to transcripts: Salmon**
    ->Image from [Patro:2015](http://biorxiv.org/content/biorxiv/early/2015/06/27/021592.full.pdf) +>Image from [Patro:2015](https://biorxiv.org/content/biorxiv/early/2015/06/27/021592.full.pdf) ### Estimating transcript levels @@ -248,7 +248,7 @@ During next expectation stage read are re-apportioned across transcripts and the >![](../../images/em.png) > >**Expectation Maximization (EM)**
    ->Image from [Pacher:2011](http://arxiv.org/pdf/1104.3889v2.pdf) +>Image from [Pacher:2011](https://arxiv.org/pdf/1104.3889v2.pdf) ### Understanding quantification metrics @@ -275,12 +275,12 @@ The goal of differential expression analysis (DE) is to find gene (DGE) or trans * Estimate the *magnitude* of expression differences; * Estimate the *significance* of expression differences. -For this expression is estimated from read counts and attempts are made to correct for variability in measurements using replicates that are absolutely essential accurate results (see below). We begin our short discussion on DE by reproducing a figure from [Trapnell:2013](http://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) highlighting some of the challenges associated with judging expression differences from read counts: +For this expression is estimated from read counts and attempts are made to correct for variability in measurements using replicates that are absolutely essential accurate results (see below). We begin our short discussion on DE by reproducing a figure from [Trapnell:2013](https://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) highlighting some of the challenges associated with judging expression differences from read counts: >![](../../images/diff.png) > >**Differential expression: Read counts and Expression levels**
    ->**Change in fragment count for a gene does not necessarily equal a change in expression**. (**a**) Simple read-counting schemes sum the fragments incident on a gene’s exons. The exon-union model counts reads falling on any of a gene’s exons, whereas the exon-intersection model counts only reads on constitutive exons. (**b**) Both of the exon-union and exon intersection counting schemes may incorrectly estimate a change in expression in genes with multiple isoforms. The true expression is estimated by the sum of the length-normalized isoform read counts. The discrepancy between a change in the union or intersection count and a change in gene expression is driven by a change in the abundance of the isoforms with respect to one another. In the top row, the gene generates the same number of reads in conditions A and B, but in condition B, all of the reads come from the shorter of the two isoforms, and thus the true expression for the gene is higher in condition B. The intersection count scheme underestimates the true change in gene expression, and the union scheme fails to detect the change entirely. In the middle row, the intersection count fails to detect a change driven by a shift in the dominant isoform for the gene. The union scheme detects a shift in the wrong direction. In the bottom row, the gene’s expression is constant, but the isoforms undergo a complete switch between conditions A and B. Both simplified counting schemes register a change in count that does not reflect a change in gene expression. Figure from [Trapnell:2013] (http://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) +>**Change in fragment count for a gene does not necessarily equal a change in expression**. (**a**) Simple read-counting schemes sum the fragments incident on a gene’s exons. The exon-union model counts reads falling on any of a gene’s exons, whereas the exon-intersection model counts only reads on constitutive exons. 
(**b**) Both of the exon-union and exon intersection counting schemes may incorrectly estimate a change in expression in genes with multiple isoforms. The true expression is estimated by the sum of the length-normalized isoform read counts. The discrepancy between a change in the union or intersection count and a change in gene expression is driven by a change in the abundance of the isoforms with respect to one another. In the top row, the gene generates the same number of reads in conditions A and B, but in condition B, all of the reads come from the shorter of the two isoforms, and thus the true expression for the gene is higher in condition B. The intersection count scheme underestimates the true change in gene expression, and the union scheme fails to detect the change entirely. In the middle row, the intersection count fails to detect a change driven by a shift in the dominant isoform for the gene. The union scheme detects a shift in the wrong direction. In the bottom row, the gene’s expression is constant, but the isoforms undergo a complete switch between conditions A and B. Both simplified counting schemes register a change in count that does not reflect a change in gene expression. Figure from [Trapnell:2013](https://www.nature.com/nbt/journal/v31/n1/abs/nbt.2450.html) The following discussion of DGE logic is reproduced from [Dündar:2015](http://chagall.med.cornell.edu/RNASEQcourse/). @@ -297,7 +297,7 @@ gene **i** is quite small. >In contrast to the Poisson distribution, we now need to estimate two parameters from the read counts: the mean as well as the dispersion. The precision of these estimates strongly depends on the number (and variation) of replicates – the more replicates, the better the grasp on the underlying mean expression values of unchanged genes and the variance that is due to biological variation rather than the experimental treatment.
For most RNA-seq experiments, only two to three replicates are available, which is not enough for reliable mean and variance estimates. Some tools therefore compensate for the lack of replication by borrowing information across genes with similar expression values and shrink a given gene’s variance towards the regressed values. These fitted values of the mean and dispersion are then used instead of the raw estimates to test for differential gene expression. > ->The best performing tools tend to be [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), [DESeq/DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html), and [limma-voom](https://www.bioconductor.org/packages/release/bioc/html/limma.html) (see Rapaport et al. ([2013](http://link.springer.com/article/10.1186/gb-2013-14-9-r95)); Soneson and Delorenzi ([2013](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-91)); Schurch et al. ([2015](http://arxiv.org/abs/1505.02017)) for reviews of DGE tools). DESeq and limma-voom tend to be more conservative than edgeR (better control of false positives), but edgeR is recommended for experiments with fewer than 12 replicates (Schurch et al., [2015](http://arxiv.org/abs/1505.02017)). +>The best performing tools tend to be [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), [DESeq/DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html), and [limma-voom](https://www.bioconductor.org/packages/release/bioc/html/limma.html) (see Rapaport et al. ([2013](https://link.springer.com/article/10.1186/gb-2013-14-9-r95)); Soneson and Delorenzi ([2013](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-91)); Schurch et al. ([2015](http://arxiv.org/abs/1505.02017)) for reviews of DGE tools). 
DESeq and limma-voom tend to be more conservative than edgeR (better control of false positives), but edgeR is recommended for experiments with fewer than 12 replicates (Schurch et al., [2015](http://arxiv.org/abs/1505.02017)). ## Let's try it @@ -354,7 +354,7 @@ Let's now take a look at some of the alignments. We will use IGV for this purpos > >![](../../images/igv_tophat.png) > ->and [sashimi plots](http://software.broadinstitute.org/software/igv/Sashimi) highlighting potential splice junctions: +>and [sashimi plots](https://software.broadinstitute.org/software/igv/Sashimi) highlighting potential splice junctions: > >![](../../images/sashimi.png) @@ -364,12 +364,12 @@ Using mapped reads produced by TopHat we will perform analysis of differential g #### Gene-based read counting with HTseq-count -[`HTSeq-count`](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html) is one of the most popular tools for gene quantification. `HTseq-count` gives you multiple choices on how to handle read mapping to multiple locations, reads overlapping introns, or reads that overlap more than one genomic feature: +[`HTSeq-count`](https://www-huber.embl.de/users/anders/HTSeq/doc/count.html) is one of the most popular tools for gene quantification. `HTseq-count` gives you multiple choices on how to handle read mapping to multiple locations, reads overlapping introns, or reads that overlap more than one genomic feature: ->[![](../../images/htseq_count.png)](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html) +>[![](../../images/htseq_count.png)](https://www-huber.embl.de/users/anders/HTSeq/doc/count.html) > >**`HTseq-count` read/feature overlap modes**
    ->The `htseq-count` script of the HTSeq suite offers three different modes to handle details of read–feature overlaps that are depicted here. The default of featureCounts is the behavior of the union option. Image is from [HTseq documentation](http://www-huber.embl.de/users/anders/HTSeq/doc/count.html); Caption by [Dündar:2015](http://chagall.med.cornell.edu/RNASEQcourse/) +>The `htseq-count` script of the HTSeq suite offers three different modes to handle details of read–feature overlaps that are depicted here. The default of featureCounts is the behavior of the union option. Image is from [HTseq documentation](https://www-huber.embl.de/users/anders/HTSeq/doc/count.html); Caption by [Dündar:2015](http://chagall.med.cornell.edu/RNASEQcourse/) Before we can use `HTseq-count` we need to download gene annotations for version `dm3` of the *Drosophila melanogaster* genome. We use version `dm3` because it is the same genome we have mapped reads against during the TopHat step. @@ -378,7 +378,7 @@ Before we can use `HTseq-count` we need to download gene annotations for version >Select **UCSC Main** from **Get Data** section of the menu. Within the UCSC Genome Browser interface set parameters as shown below. In particular make sure that **assembly** is set ti `dm3` and **output format** is set to `GTF`. Click **get output**. >[![](../../images/ucsc_dm3.png)](../../images/ucsc_dm3.png) > ->This [GTF](http://www.ensembl.org/info/website/upload/gff.html) dataset will be used one of the input for HTseq-count. +>This [GTF](https://www.ensembl.org/info/website/upload/gff.html) dataset will be used one of the input for HTseq-count. `HTseq-count` takes two inputs: (1) mapped reads in BAM format and (2) a GTF dataset containing annotation of genes. Using these inputs it will compute the number of reads per gene. 
diff --git a/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md b/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md index 24a07d21..a7475483 100644 --- a/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md +++ b/topics/variant-analysis/tutorials/diploid-variant-calling/tutorial.md @@ -8,7 +8,7 @@ tutorial_name: diploid-variant-calling # Introduction -Variant calling is a complex field that was significantly propelled by advances in DNA sequencing and efforts of large scientific consortia such as the [1000 Genomes](http://www.1000genomes.org). Here we summarize basic ideas central to Genotype and Variant calling. First, let's contrast the two things although they often go together: +Variant calling is a complex field that was significantly propelled by advances in DNA sequencing and efforts of large scientific consortia such as the [1000 Genomes](https://www.1000genomes.org). Here we summarize basic ideas central to Genotype and Variant calling. First, let's contrast the two things although they often go together: * **Variant calling** - identification of positions where the sequenced sample is different from the reference sequence (or [reference genome graph](https://github.com/vgteam/vg)); * **Genotype calling** - identifying individual's genotype at variable sites. @@ -23,7 +23,7 @@ A typical workflow for variation discovery involves the following steps (e.g., s 6. Performing filtering and genotype quality score recalibration; 7. Annotating variants and performing downstream analyses. -However, continuing evolution of variant detection methods has made some of these steps obsolete. 
For instance, omitting quality score recalibration and re-alignment (steps 3 and 4 above) when using haplotype-aware variant callers such as [FreeBayes](https://github.com/ekg/freebayes) does not have an effect on the resulting calls (see Brad Chapman's methodological comparisons at [bcbio](http://bit.ly/1S9kFJN) +However, continuing evolution of variant detection methods has made some of these steps obsolete. For instance, omitting quality score recalibration and re-alignment (steps 3 and 4 above) when using haplotype-aware variant callers such as [FreeBayes](https://github.com/ekg/freebayes) does not have an effect on the resulting calls (see Brad Chapman's methodological comparisons at [bcbio](https://bit.ly/1S9kFJN)). > ### Agenda > @@ -102,7 +102,7 @@ Suppose *Ri* is a base in read *i* corresponding to a genome position #### *P(G)* - a single sample case -One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge containing in a database, such as [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors. +One can assign an equal probability to all possible genotypes, or to source this information based on previously obtained knowledge contained in a database, such as [dbSNP](https://www.ncbi.nlm.nih.gov/SNP/). In this case (as exemplified in [Nielsen et al.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3593722/)) we may, for instance, have a site with a **G/T** polymorphism and genotypes **GG**, **TT**, and **GT** having frequencies of 0.45, 0.45, 0.09, respectively. We will use these values as priors.
#### *P(G)* - a multi-sample case @@ -123,7 +123,7 @@ This makes it highly unlikely that **AA** is a true genotype of this individual. > * **Variant quality recalibration is avoided** by incorporating a number of metrics, such as read placement bias and allele balance, directly into the Bayesian model; > * **Ability to incorporate non-diploid cases** such as pooled datasets or data from polyploid samples. -Freebayes is a *haplotype-based* variant caller. This implies that instead of looking at an individual positions within an alignment of reads to the reference genome, it looks at a haplotype window, length of which is dynamically determined (see section 3.2. in [FreeBayes manuscript](http://arxiv.org/pdf/1207.3907v2.pdf)): +Freebayes is a *haplotype-based* variant caller. This implies that instead of looking at an individual positions within an alignment of reads to the reference genome, it looks at a haplotype window, length of which is dynamically determined (see section 3.2. in [FreeBayes manuscript](https://arxiv.org/pdf/1207.3907v2.pdf)): |Haplotype-based calling | |------------------------| @@ -140,7 +140,7 @@ In this example we will perform variant calling and annotation using [genome in * HG003- NA24149 - hu6E4515 (father) * HG004- NA24143 - hu8E87A9 (mother) -Yet for a quick tutorial these datasets are way too big, so we created a [downsampled dataset](http://dx.doi.org/10.5281/zenodo.60520). This dataset was produced by mapping the trio reads against `hg19` version of the human genome, merging the resulting bam files together (we use readgroups to label individual reads so they can be traced to each of the original individuals), and restricting alignments to a small portion of chromosome 19 containing the [*POLRMT*](http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=5442) gene. +Yet for a quick tutorial these datasets are way too big, so we created a [downsampled dataset](https://dx.doi.org/10.5281/zenodo.60520). 
This dataset was produced by mapping the trio reads against `hg19` version of the human genome, merging the resulting bam files together (we use readgroups to label individual reads so they can be traced to each of the original individuals), and restricting alignments to a small portion of chromosome 19 containing the [*POLRMT*](http://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=Graphics&list_uids=5442) gene. > ### :pencil2: Hands-on: Variant calling > @@ -173,7 +173,7 @@ Yet for a quick tutorial these datasets are way too big, so we created a [downsa > ![](../../images/freebayes_gq.png) {: .hands_on} -This produces a dataset in [VCF](http://www.1000genomes.org/wiki/Analysis/variant-call-format) format containing 35 putative variants. Before we can continue we need to post-process this dataset by breaking compound variants into multiple independent variants with **VcfAllelicPrimitives** tool found within **VCF Tools** section. This is necessary for ensuring the smooth sailing through downstream analyses: +This produces a dataset in [VCF](https://www.1000genomes.org/wiki/Analysis/variant-call-format) format containing 35 putative variants. Before we can continue we need to post-process this dataset by breaking compound variants into multiple independent variants with **VcfAllelicPrimitives** tool found within **VCF Tools** section. This is necessary for ensuring the smooth sailing through downstream analyses: > ### :pencil2: Hands-on: Post-processing > @@ -192,7 +192,7 @@ This produces a dataset in [VCF](http://www.1000genomes.org/wiki/Analysis/varian ## Annotating variants with SnpEff -At this point we are ready to begin annotating variants using [**SnpEff**](http://snpeff.sourceforge.net/SnpEff.html). **SnpEff**, a project maintained by [Pablo Cingolani](https://www.linkedin.com/in/pablocingolani) "*...annotates and predicts the effects of variants on genes (such as amino acid changes)...*" and so is critical for functional interpretation of variation data. 
+At this point we are ready to begin annotating variants using [**SnpEff**](https://snpeff.sourceforge.net/SnpEff.html). **SnpEff**, a project maintained by [Pablo Cingolani](https://www.linkedin.com/in/pablocingolani) "*...annotates and predicts the effects of variants on genes (such as amino acid changes)...*" and so is critical for functional interpretation of variation data. ### :pencil2: Annotating variants @@ -209,7 +209,7 @@ or changes to codons ## Manipulating variation data with GEMINI -Now that we have an annotated VCF file it is time to peek inside our variation data. [Aaron Quinlan](http://quinlanlab.org/), creator of [GEMINI](http://gemini.readthedocs.org/en/latest/index.html), calls it *Detective work*. +Now that we have an annotated VCF file it is time to peek inside our variation data. [Aaron Quinlan](https://quinlanlab.org/), creator of [GEMINI](http://gemini.readthedocs.org/en/latest/index.html), calls it *Detective work*. ### Loading data into GEMINI @@ -239,11 +239,11 @@ This produce a list of all tables and fields in the database. ### Querying GEMINI database -GEMINI database is queried using the versatile SQL language (more on SQL [here](http://swcarpentry.github.io/sql-novice-survey)). In Galaxy's version of GEMINI this is done using **GEMINI_query** tool. Within this tool SQL commands are typed directly into the **The query to be issued to the database** text box. Let's begin getting information from some of the tables we discovered with **GEMINI_db_info** tool above. +GEMINI database is queried using the versatile SQL language (more on SQL [here](https://swcarpentry.github.io/sql-novice-survey)). In Galaxy's version of GEMINI this is done using **GEMINI_query** tool. Within this tool SQL commands are typed directly into the **The query to be issued to the database** text box. Let's begin getting information from some of the tables we discovered with **GEMINI_db_info** tool above. 
> ### :bulb: Tip: Gemini tutorials > -> The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/gemini-tutorials/Intro-To-Gemini.pdf)" tutorial. For extensive documentation see "[Querying GEMINI](http://gemini.readthedocs.org/en/latest/content/querying.html)". +> The examples below are taken from "[Intro to Gemini](https://s3.amazonaws.com/gemini-tutorials/Intro-To-Gemini.pdf)" tutorial. For extensive documentation see "[Querying GEMINI](https://gemini.readthedocs.org/en/latest/content/querying.html)". {: .tip} > ### :pencil2: Hands-on: Selecting "novel" variants that are not annotated in dbSNP database @@ -258,7 +258,7 @@ GEMINI database is queried using the versatile SQL language (more on SQL [here]( > ### :pencil2: Find variants in POLRMT gene > -> The query `SELECT * FROM variants WHERE filter is NULL and gene = 'POLRMT'` will produce [output](https://usegalaxy.org/datasets/bbd44e69cb8906b5a0bb5b2cc0695697/display/?preview=True) with very large number of columns. To restrict the number of columns to a manageable set let's use this command: `SELECT rs_ids, aaf_esp_ea, impact, clinvar_disease_name, clinvar_sig FROM variants WHERE filter is NULL and gene = 'POLRMT'` (column definitions can be found [here](http://gemini.readthedocs.org/en/latest/content/database_schema.html)) +> The query `SELECT * FROM variants WHERE filter is NULL and gene = 'POLRMT'` will produce [output](https://usegalaxy.org/datasets/bbd44e69cb8906b5a0bb5b2cc0695697/display/?preview=True) with very large number of columns. 
To restrict the number of columns to a manageable set let's use this command: `SELECT rs_ids, aaf_esp_ea, impact, clinvar_disease_name, clinvar_sig FROM variants WHERE filter is NULL and gene = 'POLRMT'` (column definitions can be found [here](https://gemini.readthedocs.org/en/latest/content/database_schema.html)) [Output](https://usegalaxy.org/datasets/bbd44e69cb8906b540d65297cd1d26bb/display/?preview=True) shows variants found within the *POLRMT* gene. @@ -301,7 +301,7 @@ Wildcards simply writing SQL expressions when searching across multiple terms. T > >
    > Click to view answer -> Type `SELECT chrom, start, end, ref, alt, gene, impact, (gts).(*) FROM variants` into The query to be issued to the database and `(gt_types).(*).(==HET).(all)` into Restrictions to apply to genotype values. Here we use wildcards for the query (`(gts.*)` = get genotypes for all samples) and genotype filtering (`(gt_types).(*).(==HET).(all)`, the all operator implies that want results for all afftected individuals). It will generate this output. +> Type `SELECT chrom, start, end, ref, alt, gene, impact, (gts).(*) FROM variants` into The query to be issued to the database and `(gt_types).(*).(==HET).(all)` into Restrictions to apply to genotype values. Here we use wildcards for the query (`(gts.*)` = get genotypes for all samples) and genotype filtering (`(gt_types).(*).(==HET).(all)`, the all operator implies that want results for all afftected individuals). It will generate this output. >
    {: .question} diff --git a/topics/variant-analysis/tutorials/exome-seq/tutorial.md b/topics/variant-analysis/tutorials/exome-seq/tutorial.md index 6e778401..bed3a53f 100644 --- a/topics/variant-analysis/tutorials/exome-seq/tutorial.md +++ b/topics/variant-analysis/tutorials/exome-seq/tutorial.md @@ -179,7 +179,7 @@ to simplify the variant representation. ## Annotate your variants -To annotate the variants, we use the [dbSNP](http://www.ncbi.nlm.nih.gov/SNP/), +To annotate the variants, we use the [dbSNP](https://www.ncbi.nlm.nih.gov/SNP/), the NCBI database of genetic variation and then `hg19` database with **SnpEff**. > ### :pencil2: Hands-on: Annotating variants @@ -262,7 +262,7 @@ relations, additional annotations and most importantly its fast to search. # Variant analysis **GEMINI query** is the most versatile of all the GEMINI tools. You can use it to -ask 'interesting' questions in simple SQL (see the GEMINI [handbook](http://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003153) on its usage). +ask 'interesting' questions in simple SQL (see the GEMINI [handbook](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1003153) on its usage). 
### **GEMINI query** examples: - `select chrom, start, end from variants` will show you some information on all From ade2a583f9eac97cb97a7cf6fe092b5ac99fe9c1 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 20:42:47 +0200 Subject: [PATCH 32/35] fix links in READMEs --- topics/admin/README.md | 2 +- topics/assembly/README.md | 2 +- topics/chip-seq/README.md | 2 +- topics/dev/README.md | 2 +- topics/epigenetics/README.md | 2 +- topics/introduction/README.md | 7 +++---- topics/sequence-analysis/README.md | 2 +- topics/training/README.md | 2 +- topics/transcriptomics/README.md | 2 +- topics/variant-analysis/README.md | 2 +- 10 files changed, 12 insertions(+), 13 deletions(-) diff --git a/topics/admin/README.md b/topics/admin/README.md index f15ae42e..d8c47e18 100644 --- a/topics/admin/README.md +++ b/topics/admin/README.md @@ -1,4 +1,4 @@ Admin ===== -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/assembly/README.md b/topics/assembly/README.md index cf5fcc31..30195c21 100644 --- a/topics/assembly/README.md +++ b/topics/assembly/README.md @@ -1,4 +1,4 @@ Assembly ======== -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/chip-seq/README.md b/topics/chip-seq/README.md index dd9624d7..1ca0c99a 100644 --- a/topics/chip-seq/README.md +++ b/topics/chip-seq/README.md @@ -1,4 +1,4 @@ ChIP-seq data analysis ====================== -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git 
a/topics/dev/README.md b/topics/dev/README.md index ba5d47b2..3387a83a 100644 --- a/topics/dev/README.md +++ b/topics/dev/README.md @@ -1,4 +1,4 @@ Dev === -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/epigenetics/README.md b/topics/epigenetics/README.md index bd92c91f..3025a63e 100644 --- a/topics/epigenetics/README.md +++ b/topics/epigenetics/README.md @@ -1,4 +1,4 @@ Epigenetic data analysis ======================== -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/introduction/README.md b/topics/introduction/README.md index 36ec11e8..262afe49 100644 --- a/topics/introduction/README.md +++ b/topics/introduction/README.md @@ -15,10 +15,9 @@ A deck of slides is available for this topic: Several tutorials with hands-on are available for this topic: -- [Galaxy Introduction Exercise: From Peaks to Genes](tutorial/introduction.md) -- [Getting to know workflows](tutorial/workflows.md) -- [Processing many samples at once](tutorial/processing_many_samples.md) -- [Using the Integrative Genomics Viewer](./tutorials/igv.md) +- [Galaxy Introduction Exercise: From Peaks to Genes](tutorials/galaxy-intro-peaks2genes/tutorial.md) +- [Processing many samples at once](tutorials/processing-many-samples-at-once/tutorial.md) +- [Using the Integrative Genomics Viewer](tutorials/igv-introduction/tutorial.md) ## Input datasets diff --git a/topics/sequence-analysis/README.md b/topics/sequence-analysis/README.md index a76c35bb..35033529 100644 --- a/topics/sequence-analysis/README.md +++ b/topics/sequence-analysis/README.md @@ -1,4 +1,4 @@ Sequence Analysis ================= -Please refer to the
[CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/training/README.md b/topics/training/README.md index 2dd90eae..d0904454 100644 --- a/topics/training/README.md +++ b/topics/training/README.md @@ -1,4 +1,4 @@ Train the trainers ========== -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material diff --git a/topics/transcriptomics/README.md b/topics/transcriptomics/README.md index 815a578d..070e9a8f 100644 --- a/topics/transcriptomics/README.md +++ b/topics/transcriptomics/README.md @@ -23,7 +23,7 @@ For ref-based tutorial, the original data is available at NCBI Gene Expression O ## Galaxy instance -For these tutorials, you can use the [dedicated Docker image](docker/README.md): +For these tutorials, you can use the [dedicated Docker image](docker/Dockerfile): ``` docker run -d -p 8080:80 bgruening/galaxy-rna-seq-training diff --git a/topics/variant-analysis/README.md b/topics/variant-analysis/README.md index 9c13eb9b..b09cb223 100644 --- a/topics/variant-analysis/README.md +++ b/topics/variant-analysis/README.md @@ -1,4 +1,4 @@ Variant analysis ================ -Please refer to the [CONTRIBUTING.md](../CONTRIBUTING.md) before adding or updating any material \ No newline at end of file +Please refer to the [CONTRIBUTING.md](../../CONTRIBUTING.md) before adding or updating any material From 9a497960b8f433d89f98dc365ece5f8f4feb9804 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 22:50:16 +0200 Subject: [PATCH 33/35] don't use https on localhost --- topics/dev/tutorials/architecture/slides.html | 2 +- topics/dev/tutorials/interactive-environments/slides.html | 4 ++-- topics/dev/tutorials/tool-integration/slides.html | 4 ++-- 3 files 
changed, 5 insertions(+), 5 deletions(-) diff --git a/topics/dev/tutorials/architecture/slides.html b/topics/dev/tutorials/architecture/slides.html index ce95d018..6440a629 100644 --- a/topics/dev/tutorials/architecture/slides.html +++ b/topics/dev/tutorials/architecture/slides.html @@ -1320,7 +1320,7 @@ .code[``` galaxy.queue_worker INFO 2016-06-23 19:13:39,049 Binding and starting galaxy control worker for main Starting server in PID 21102. -serving on https://127.0.0.1:8080 +serving on http://127.0.0.1:8080 ```] --- diff --git a/topics/dev/tutorials/interactive-environments/slides.html b/topics/dev/tutorials/interactive-environments/slides.html index 93217c83..419515cc 100644 --- a/topics/dev/tutorials/interactive-environments/slides.html +++ b/topics/dev/tutorials/interactive-environments/slides.html @@ -423,8 +423,8 @@ # Note the trailing slash used everywhere! location PROXY_PREFIX/helloworld/ { proxy_buffering off; - proxy_pass https://127.0.0.1:8000/; - proxy_redirect https://127.0.0.1:8000/ PROXY_PREFIX/helloworld/; + proxy_pass http://127.0.0.1:8000/; + proxy_redirect http://127.0.0.1:8000/ PROXY_PREFIX/helloworld/; } } ``` diff --git a/topics/dev/tutorials/tool-integration/slides.html b/topics/dev/tutorials/tool-integration/slides.html index 9572099e..8981c8f1 100644 --- a/topics/dev/tutorials/tool-integration/slides.html +++ b/topics/dev/tutorials/tool-integration/slides.html @@ -585,7 +585,7 @@ $ planemo serve ``` -Open https://127.0.0.1:9090/ in your web browser to view your new tool +Open http://127.0.0.1:9090/ in your web browser to view your new tool --- @@ -1389,7 +1389,7 @@ ```yaml api_key: admin -galaxy_instance: https://127.0.0.1:8080/ +galaxy_instance: http://127.0.0.1:8080/ tools: - name: fastqc owner: devteam From f1495387eee6a5ea5efb09f40083037a0910b073 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 23:01:50 +0200 Subject: [PATCH 34/35] fix more links --- topics/usegalaxy/tutorials/ngs/tutorial.md | 6 +++--- 
topics/usegalaxy/tutorials/non-dip/tutorial.md | 2 +- topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/topics/usegalaxy/tutorials/ngs/tutorial.md b/topics/usegalaxy/tutorials/ngs/tutorial.md index 08ee33d0..1e2505d1 100644 --- a/topics/usegalaxy/tutorials/ngs/tutorial.md +++ b/topics/usegalaxy/tutorials/ngs/tutorial.md @@ -53,7 +53,7 @@ These are paired end data (see below for explanation of what paired-end is) for ## What is Fastq? -[FastQ](https://en.wikipedia.org/wiki/FASTQ_format) is not a very well defined format. In the beginning various manufacturers of sequencing instruments were free to interpret fastq as they saw fit, resulting in a multitude of fastq flavors. This variation stemmed primarily from different ways of encoding quality values as described [here](http://en.wikipedia.org/wiki/FASTQ_format) (below you will explanation of quality scores and their meaning). Today, [fastq Sanger](http://www.ncbi.nlm.nih.gov/pubmed/20015970) version of the format is considered to be the standard form of fastq. Galaxy is using fastq sanger as the only legitimate input for downstream processing tools and provides [a number of utilities for converting fastq files](http://www.ncbi.nlm.nih.gov/pubmed/20562416) into this form (see **NGS: QC and manipulation** section of Galaxy tools). +[FastQ](https://en.wikipedia.org/wiki/FASTQ_format) is not a very well defined format. In the beginning various manufacturers of sequencing instruments were free to interpret fastq as they saw fit, resulting in a multitude of fastq flavors. This variation stemmed primarily from different ways of encoding quality values as described [here](https://en.wikipedia.org/wiki/FASTQ_format) (below you will explanation of quality scores and their meaning). Today, [fastq Sanger](https://www.ncbi.nlm.nih.gov/pubmed/20015970) version of the format is considered to be the standard form of fastq. 
Galaxy is using fastq sanger as the only legitimate input for downstream processing tools and provides [a number of utilities for converting fastq files](https://www.ncbi.nlm.nih.gov/pubmed/20562416) into this form (see **NGS: QC and manipulation** section of Galaxy tools). Fastq format looks like this: @@ -206,8 +206,8 @@ Mapping of NGS reads against reference sequences is one of the key steps of the - 2009 Bowtie 1 - [Langmead et al.](https://genomebiology.com/content/10/3/R25) - 2012 Bowtie 2 - [Langmead and Salzberg](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3322381/) -- 2009 BWA - [Li and Durbin](https://bioinformatics.oxfordjournals.org/content/25/14/1754.long) -- 2010 BWA - [Li and Durbin](https://bioinformatics.oxfordjournals.org/content/26/5/589) +- 2009 BWA - [Li and Durbin](hhttps://academic.oup.com/bioinformatics/article/25/14/1754/225615/Fast-and-accurate-short-read-alignment-with) +- 2010 BWA - [Li and Durbin](https://academic.oup.com/bioinformatics/article/26/5/589/211735/Fast-and-accurate-long-read-alignment-with-Burrows) - 2013 BWA-MEM - [Li](https://arxiv.org/abs/1303.3997) ## Mapping against a pre-computed genome index diff --git a/topics/usegalaxy/tutorials/non-dip/tutorial.md b/topics/usegalaxy/tutorials/non-dip/tutorial.md index b97b6592..81ac1634 100644 --- a/topics/usegalaxy/tutorials/non-dip/tutorial.md +++ b/topics/usegalaxy/tutorials/non-dip/tutorial.md @@ -137,7 +137,7 @@ In other words the two datasets had ~6% and ~9% duplicates, respectively. # Left-aligning indels -Left aligning of indels (a variant of re-aligning) is extremely important for obtaining accurate variant calls. This concept, while not difficult, requires some explanation. For illustrating how left-aligning works we expanded on an example provided by [Tan:2015](https://bioinformatics.oxfordjournals.org/content/31/13/2202.abstract). 
Suppose you have a reference sequence and a sequencing read: +Left aligning of indels (a variant of re-aligning) is extremely important for obtaining accurate variant calls. This concept, while not difficult, requires some explanation. For illustrating how left-aligning works we expanded on an example provided by [Tan:2015](https://academic.oup.com/bioinformatics/article/31/13/2202/196142/Unified-representation-of-genetic-variants). Suppose you have a reference sequence and a sequencing read: ``` diff --git a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md index 582c93e7..7f322f60 100644 --- a/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md +++ b/topics/usegalaxy/tutorials/rb-rnaseq/tutorial.md @@ -144,7 +144,7 @@ To further optimize and speed up spliced read alignment Kim at al. [2015](https: ### STAR mapper -[STAR aligner](https://github.com/alexdobin/STAR) is a fast alternative for mapping RNAseq reads against genome utilizing uncompressed [suffix array](https://en.wikipedia.org/wiki/Suffix_array). It operates in [two stages](https://bioinformatics.oxfordjournals.org/content/early/2012/10/25/bioinformatics.bts635.abstract). In the first stage it performs seed search: +[STAR aligner](https://github.com/alexdobin/STAR) is a fast alternative for mapping RNAseq reads against genome utilizing uncompressed [suffix array](https://en.wikipedia.org/wiki/Suffix_array). It operates in [two stages](https://academic.oup.com/bioinformatics/article/29/1/15/272537/STAR-ultrafast-universal-RNA-seq-aligner). 
In the first stage it performs seed search: >![](../../images/star.png) > From bc4628c5e75b492fa0f3a2ec1ee89936135fef36 Mon Sep 17 00:00:00 2001 From: Saskia Hiltemann Date: Fri, 7 Jul 2017 23:06:17 +0200 Subject: [PATCH 35/35] oops --- topics/usegalaxy/tutorials/ngs/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topics/usegalaxy/tutorials/ngs/tutorial.md b/topics/usegalaxy/tutorials/ngs/tutorial.md index 1e2505d1..7b13fd2e 100644 --- a/topics/usegalaxy/tutorials/ngs/tutorial.md +++ b/topics/usegalaxy/tutorials/ngs/tutorial.md @@ -206,7 +206,7 @@ Mapping of NGS reads against reference sequences is one of the key steps of the - 2009 Bowtie 1 - [Langmead et al.](https://genomebiology.com/content/10/3/R25) - 2012 Bowtie 2 - [Langmead and Salzberg](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3322381/) -- 2009 BWA - [Li and Durbin](hhttps://academic.oup.com/bioinformatics/article/25/14/1754/225615/Fast-and-accurate-short-read-alignment-with) +- 2009 BWA - [Li and Durbin](https://academic.oup.com/bioinformatics/article/25/14/1754/225615/Fast-and-accurate-short-read-alignment-with) - 2010 BWA - [Li and Durbin](https://academic.oup.com/bioinformatics/article/26/5/589/211735/Fast-and-accurate-long-read-alignment-with-Burrows) - 2013 BWA-MEM - [Li](https://arxiv.org/abs/1303.3997)