This document illustrates how to use genome-sampler
on a small tutorial data set using step-by-step instructions.
This gives you complete control over the analysis, but it is more complex to run than the genome-sampler Snakemake workflow illustrated in the Snakemake tutorial.
Download tutorial data
The tutorial data set used here is intended for educational purposes only. If you’re interested in using these sequences for other analyses, we recommend starting from a sequence repository such as GISAID or NCBI GenBank, which will have much more recent data.
Download the tutorial sequences and corresponding metadata using the following commands:
wget -O 'context-metadata.tsv' \
'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-metadata.tsv'
wget -O 'focal-metadata.tsv' \
'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-metadata.tsv'
from qiime2 import Metadata
from urllib import request
url = 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-metadata.tsv'
fn = 'context-metadata.tsv'
request.urlretrieve(url, fn)
context_metadata_md = Metadata.load(fn)
url = 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-metadata.tsv'
fn = 'focal-metadata.tsv'
request.urlretrieve(url, fn)
focal_metadata_md = Metadata.load(fn)
library(reticulate)
Metadata <- import("qiime2")$Metadata
request <- import("urllib")$request
url <- 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-metadata.tsv'
fn <- 'context-metadata.tsv'
request$urlretrieve(url, fn)
context_metadata_md <- Metadata$load(fn)
url <- 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-metadata.tsv'
fn <- 'focal-metadata.tsv'
request$urlretrieve(url, fn)
focal_metadata_md <- Metadata$load(fn)
context_metadata = use.init_metadata_from_url(
'context-metadata',
'https://raw.githubusercontent.com/caporaso-lab/genome-sampler/r2020.8/snakemake/tutorial-data/context-metadata.tsv')
focal_metadata = use.init_metadata_from_url(
'focal-metadata',
'https://raw.githubusercontent.com/caporaso-lab/genome-sampler/r2020.8/snakemake/tutorial-data/focal-metadata.tsv')
wget -O 'context-seqs-raw.fasta' \
'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-seqs-raw.fasta'
wget -O 'focal-seqs-raw.fasta' \
'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-seqs-raw.fasta'
url = 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-seqs-raw.fasta'
fn = 'context-seqs-raw.fasta'
request.urlretrieve(url, fn)
url = 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-seqs-raw.fasta'
fn = 'focal-seqs-raw.fasta'
request.urlretrieve(url, fn)
url <- 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/context-seqs-raw.fasta'
fn <- 'context-seqs-raw.fasta'
request$urlretrieve(url, fn)
url <- 'https://genome-sampler.readthedocs.io/en/latest/data/tutorial/focal-seqs-raw.fasta'
fn <- 'focal-seqs-raw.fasta'
request$urlretrieve(url, fn)
def fasta_factory(url):
    # Download a fasta file and load it into a GISAIDDNAFASTAFormat object.
    import urllib.request
    from io import TextIOWrapper
    from genome_sampler.common import GISAIDDNAFASTAFormat

    data = urllib.request.urlopen(url)
    ff = GISAIDDNAFASTAFormat()
    with ff.open() as fh:
        fh.write(TextIOWrapper(data).read())
    return ff

def context_fasta_factory():
    url = 'https://raw.githubusercontent.com/caporaso-lab/genome-sampler/r2020.8/snakemake/tutorial-data/context-seqs.fasta'
    return fasta_factory(url)

context_seqs_raw = use.init_format('context-seqs-raw', context_fasta_factory, ext='fasta')

def focal_fasta_factory():
    url = 'https://raw.githubusercontent.com/caporaso-lab/genome-sampler/r2020.8/snakemake/tutorial-data/focal-seqs.fasta'
    return fasta_factory(url)

focal_seqs_raw = use.init_format('focal-seqs-raw', focal_fasta_factory, ext='fasta')
Importing
You’ll begin the workflow by importing the fasta files into QIIME 2 Artifacts. QIIME 2 Artifacts are structured zip files that contain your data (still in fasta format) along with QIIME 2-specific information that, among other things, automatically records the methods applied to the data so you don’t have to track that yourself.
qiime tools import \
--type 'FeatureData[Sequence]' \
--input-format GISAIDDNAFASTAFormat \
--input-path context-seqs-raw.fasta \
--output-path context-seqs.qza
qiime tools import \
--type 'FeatureData[Sequence]' \
--input-format GISAIDDNAFASTAFormat \
--input-path focal-seqs-raw.fasta \
--output-path focal-seqs.qza
from qiime2 import Artifact
context_seqs = Artifact.import_data(
'FeatureData[Sequence]',
'context-seqs-raw.fasta',
'GISAIDDNAFASTAFormat',
)
focal_seqs = Artifact.import_data(
'FeatureData[Sequence]',
'focal-seqs-raw.fasta',
'GISAIDDNAFASTAFormat',
)
Artifact <- import("qiime2")$Artifact
context_seqs <- Artifact$import_data(
'FeatureData[Sequence]',
'context-seqs-raw.fasta',
'GISAIDDNAFASTAFormat',
)
focal_seqs <- Artifact$import_data(
'FeatureData[Sequence]',
'focal-seqs-raw.fasta',
'GISAIDDNAFASTAFormat',
)
context_seqs = use.import_from_format('context_seqs', 'FeatureData[Sequence]', context_seqs_raw, 'GISAIDDNAFASTAFormat')
focal_seqs = use.import_from_format('focal_seqs', 'FeatureData[Sequence]', focal_seqs_raw, 'GISAIDDNAFASTAFormat')
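As a quick optional check of the claim above that Artifacts are structured zip files, you can list an artifact’s contents with Python’s standard library. This sketch is not part of the workflow; it assumes you ran the command-line import so that context-seqs.qza exists on disk (if you used the Python API instead, save the artifact first with context_seqs.save('context-seqs.qza')).
# Optional: peek inside an artifact. A .qza is a structured zip file that
# holds the imported data plus the provenance QIIME 2 records automatically.
import zipfile
with zipfile.ZipFile('context-seqs.qza') as zf:
    for name in zf.namelist()[:10]:  # show the first few entries
        print(name)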
If you’re obtaining context sequences from a public repository, you may encounter context sequences that don’t have associated metadata records. Steps in this workflow that require context sequence metadata would fail as a result, so at this stage we remove any context sequences that don’t have metadata records.
qiime feature-table filter-seqs \
--i-data context-seqs.qza \
--m-metadata-file context-metadata.tsv \
--o-filtered-data context-seqs-w-metadata.qza
import qiime2.plugins.feature_table.actions as feature_table_actions
context_seqs_w_metadata, = feature_table_actions.filter_seqs(
data=context_seqs,
metadata=context_metadata_md,
)
feature_table_actions <- import("qiime2.plugins.feature_table.actions")
action_results <- feature_table_actions$filter_seqs(
data=context_seqs,
metadata=context_metadata_md,
)
context_seqs_w_metadata <- action_results$filtered_data
context_seqs_w_metadata, = use.action(
use.UsageAction(plugin_id='feature_table',
action_id='filter_seqs'),
use.UsageInputs(data=context_seqs, metadata=context_metadata),
use.UsageOutputNames(filtered_data='context_seqs_w_metadata')
)
Quality filtering
Next, we’ll apply a quality filter to the sequence data.
Technically this is an optional step for both the context and focal sequences, but in practice if you obtain your context sequences from a public repository you should consider the context sequence filtering to be essential.
You can use this to remove sequences based on their length (if they’re too short or too long) and based on the proportion of ambiguous (e.g., N) nucleotide characters that they contain.
Here we’ll apply this to both the context sequences and the focal sequences, removing sequences in which more than 1% of the characters are ambiguous.
qiime genome-sampler filter-seqs \
--i-sequences context-seqs-w-metadata.qza \
--p-max-proportion-ambiguous 0.01 \
--o-filtered-sequences filtered-context-seqs.qza
qiime genome-sampler filter-seqs \
--i-sequences focal-seqs.qza \
--p-max-proportion-ambiguous 0.01 \
--o-filtered-sequences filtered-focal-seqs.qza
import qiime2.plugins.genome_sampler.actions as genome_sampler_actions
filtered_context_seqs, = genome_sampler_actions.filter_seqs(
sequences=context_seqs_w_metadata,
max_proportion_ambiguous=0.01,
)
filtered_focal_seqs, = genome_sampler_actions.filter_seqs(
sequences=focal_seqs,
max_proportion_ambiguous=0.01,
)
genome_sampler_actions <- import("qiime2.plugins.genome_sampler.actions")
action_results <- genome_sampler_actions$filter_seqs(
sequences=context_seqs_w_metadata,
max_proportion_ambiguous=0.01,
)
filtered_context_seqs <- action_results$filtered_sequences
action_results <- genome_sampler_actions$filter_seqs(
sequences=focal_seqs,
max_proportion_ambiguous=0.01,
)
filtered_focal_seqs <- action_results$filtered_sequences
filtered_context_seqs, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='filter_seqs'),
use.UsageInputs(sequences=context_seqs_w_metadata,
max_proportion_ambiguous=0.01),
use.UsageOutputNames(filtered_sequences='filtered_context_seqs')
)
filtered_focal_seqs, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='filter_seqs'),
use.UsageInputs(sequences=focal_seqs,
max_proportion_ambiguous=0.01),
use.UsageOutputNames(filtered_sequences='filtered_focal_seqs')
)
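To make the ambiguity threshold concrete, the small snippet below (illustrative only, not part of the workflow) computes the proportion of ambiguous characters in a sequence; here anything other than A, C, G, or T counts as ambiguous, which may differ in detail from how genome-sampler’s filter-seqs action defines ambiguity.
# Illustrative only: proportion of ambiguous characters in a sequence,
# treating any character other than A/C/G/T as ambiguous.
def proportion_ambiguous(seq):
    seq = seq.upper()
    return sum(1 for c in seq if c not in 'ACGT') / len(seq)

# 2 ambiguous characters (N and R) out of 20 is 10%, well above the 0.01
# (1%) threshold used above, so a sequence like this would be removed.
print(proportion_ambiguous('ACGTNACGTRACGTACGTAC'))  # 0.1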
At any time, you can get summary information about your sequences using the following command:
qiime feature-table tabulate-seqs \
--i-data filtered-focal-seqs.qza \
--o-visualization filtered-focal-seqs-summary.qzv
filtered_focal_seqs_summary_viz, = feature_table_actions.tabulate_seqs(
data=filtered_focal_seqs,
)
action_results <- feature_table_actions$tabulate_seqs(
data=filtered_focal_seqs,
)
filtered_focal_seqs_summary_viz <- action_results$visualization
filtered_focal_seqs_summary, = use.action(
use.UsageAction(plugin_id='feature_table',
action_id='tabulate_seqs'),
use.UsageInputs(data=filtered_focal_seqs),
use.UsageOutputNames(visualization='filtered_focal_seqs_summary')
)
That command will create a QIIME 2 visualization, which is most often viewed using QIIME 2 View. Try viewing that file and finding information such as the number of sequences it contains and their median length. (Your data is not uploaded to a server when you visit QIIME 2 View, so you don’t need to be concerned about exposing sensitive research data.) For additional information on how to view QIIME 2 visualizations, see here.
Sampling genomes
We’re now ready to start sampling our data, and we’ll do this in three steps. These subsampling steps are independent of one another, so they can be run in any order.
Sampling across time ⏱️
First, we’ll sample across time. By default, this will select seven genomes per seven-day period. The file generated here isn’t very useful to view directly, but we’ll use it shortly to see how many sequences were retained at this step.
qiime genome-sampler sample-longitudinal \
--i-context-seqs filtered-context-seqs.qza \
--m-dates-file context-metadata.tsv \
--m-dates-column date \
--o-selection date-selection.qza
date_mdc = context_metadata_md.get_column('date')
date_selection, = genome_sampler_actions.sample_longitudinal(
context_seqs=filtered_context_seqs,
dates=date_mdc,
)
date_mdc <- context_metadata_md$get_column('date')
action_results <- genome_sampler_actions$sample_longitudinal(
context_seqs=filtered_context_seqs,
dates=date_mdc,
)
date_selection <- action_results$selection
dates = use.get_metadata_column('date', 'date', context_metadata)
date_selection, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='sample_longitudinal'),
use.UsageInputs(context_seqs=filtered_context_seqs,
dates=dates),
use.UsageOutputNames(selection='date_selection')
)
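If you’re curious how sampling seven genomes per seven-day period works conceptually, the pandas sketch below mimics the idea on the context metadata. It is illustrative only; genome-sampler’s actual implementation may differ in details such as how window boundaries and random seeds are handled.
# Illustrative only: group records into 7-day windows by their 'date' column
# and keep at most 7 randomly chosen records per window.
import pandas as pd
df = context_metadata_md.to_dataframe().reset_index()
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])
sampled = (df.groupby(pd.Grouper(key='date', freq='7D'), group_keys=False)
             .apply(lambda g: g.sample(min(7, len(g)), random_state=42)))
print(len(sampled), 'of', len(df), 'records retained')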
Sampling across biological diversity 🌲
Next, we’ll sample across viral diversity. This will cluster sequences at a percent identity threshold of 99.95%, and select the centroid sequence from each cluster to include in downstream analyses.
qiime genome-sampler sample-diversity \
--i-context-seqs filtered-context-seqs.qza \
--p-percent-id 0.9995 \
--o-selection diversity-selection.qza
diversity_selection, = genome_sampler_actions.sample_diversity(
context_seqs=filtered_context_seqs,
percent_id=0.9995,
)
action_results <- genome_sampler_actions$sample_diversity(
context_seqs=filtered_context_seqs,
percent_id=0.9995,
)
diversity_selection <- action_results$selection
diversity_selection, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='sample_diversity'),
use.UsageInputs(context_seqs=filtered_context_seqs,
percent_id=0.9995),
use.UsageOutputNames(selection='diversity_selection')
)
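To put that threshold in perspective: for genomes of roughly 30,000 nucleotides (an assumption about the tutorial data; adjust for your own genomes), 99.95% identity allows only around 15 differing positions between sequences in the same cluster.
# Back-of-the-envelope arithmetic for the 0.9995 identity threshold,
# assuming a genome length of about 30,000 nucleotides.
print(round(30_000 * (1 - 0.9995)))  # 15 positions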
Sampling across location of isolation 🗺️
Last, we’ll sample near neighbors of the focal sequences from the context sequences. Of the near-neighbor sequences identified for each focal sequence (up to 10 by default), 3 will be selected at random such that each location (defined in the context-metadata.tsv file) has an equal probability of being selected for inclusion in downstream analysis.
qiime genome-sampler sample-neighbors \
--i-focal-seqs filtered-focal-seqs.qza \
--i-context-seqs filtered-context-seqs.qza \
--m-locale-file context-metadata.tsv \
--m-locale-column location \
--p-percent-id 0.9999 \
--p-samples-per-cluster 3 \
--o-selection neighbor-selection.qza
location_mdc = context_metadata_md.get_column('location')
neighbor_selection, = genome_sampler_actions.sample_neighbors(
focal_seqs=filtered_focal_seqs,
context_seqs=filtered_context_seqs,
locale=location_mdc,
percent_id=0.9999,
samples_per_cluster=3,
)
location_mdc <- context_metadata_md$get_column('location')
action_results <- genome_sampler_actions$sample_neighbors(
focal_seqs=filtered_focal_seqs,
context_seqs=filtered_context_seqs,
locale=location_mdc,
percent_id=0.9999,
samples_per_cluster=3L,
)
neighbor_selection <- action_results$selection
locale = use.get_metadata_column('location', 'location', context_metadata)
neighbor_selection, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='sample_neighbors'),
use.UsageInputs(focal_seqs=filtered_focal_seqs,
context_seqs=filtered_context_seqs,
locale=locale,
percent_id=0.9999,
samples_per_cluster=3),
use.UsageOutputNames(selection='neighbor_selection')
)
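The location-balanced selection can be pictured with the sketch below, which uses made-up data and is illustrative only (genome-sampler’s actual per-cluster sampling may differ in detail): a location is drawn uniformly at random and then a neighbor from that location, so locations contributing many near neighbors don’t dominate the selection.
# Illustrative only: pick 3 neighbors so that each location is equally
# likely to be chosen, regardless of how many neighbors it contributed.
import random
random.seed(42)
# Hypothetical near-neighbor hits for one focal sequence: id -> location
neighbors = {'seq1': 'USA', 'seq2': 'USA', 'seq3': 'USA', 'seq4': 'USA',
             'seq5': 'Canada', 'seq6': 'Canada', 'seq7': 'Mexico'}
by_location = {}
for seq_id, location in neighbors.items():
    by_location.setdefault(location, []).append(seq_id)
selected = set()
while len(selected) < 3:
    location = random.choice(list(by_location))  # each location equally likely
    selected.add(random.choice(by_location[location]))
print(sorted(selected))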
Combine and summarize samples
Now, we’ll combine the results of the three sampling approaches:
qiime genome-sampler combine-selections \
--i-selections date-selection.qza diversity-selection.qza neighbor-selection.qza \
--o-combined-selection master-selection.qza
master_selection, = genome_sampler_actions.combine_selections(
selections=[date_selection, diversity_selection, neighbor_selection],
)
action_results <- genome_sampler_actions$combine_selections(
selections=list(date_selection, diversity_selection, neighbor_selection),
)
master_selection <- action_results$combined_selection
master_selection, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='combine_selections'),
use.UsageInputs(selections=[date_selection, diversity_selection, neighbor_selection]),
use.UsageOutputNames(combined_selection='master_selection')
)
And we’ll generate a summary of the full selection process:
qiime genome-sampler summarize-selections \
--i-selections date-selection.qza diversity-selection.qza neighbor-selection.qza \
--o-visualization selection-summary.qzv
selection_summary_viz, = genome_sampler_actions.summarize_selections(
selections=[date_selection, diversity_selection, neighbor_selection],
)
action_results <- genome_sampler_actions$summarize_selections(
selections=list(date_selection, diversity_selection, neighbor_selection),
)
selection_summary_viz <- action_results$visualization
selection_summary, = use.action(
use.UsageAction(plugin_id='genome_sampler',
action_id='summarize_selections'),
use.UsageInputs(selections=[date_selection, diversity_selection, neighbor_selection]),
use.UsageOutputNames(visualization='selection_summary')
)
The selection-summary.qzv file provides a summary of the sampling run. You can view this file using QIIME 2 View. Be sure to click the Provenance tab after loading the file on that page; that will provide full details on the workflow that was executed.
How many sequences were retained by each sampling step?
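If you’d rather check programmatically than through the visualization, each selection artifact can be viewed as QIIME 2 Metadata (the same trick used below when filtering sequences). Assuming the Metadata view lists the selected identifiers, which is how it’s used for filtering below, the number of rows is the number of context sequences retained by that step. This is a sketch, not part of the workflow.
# Hedged sketch: count sequences retained by each sampling step by viewing
# each selection as Metadata and counting its identifiers.
for name, selection in [('date', date_selection),
                        ('diversity', diversity_selection),
                        ('neighbor', neighbor_selection)]:
    print(name, len(selection.view(Metadata).to_dataframe()))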
Preparing sampled data for downstream applications
We’re now ready to start compiling our final data set.
To do this, we’ll use the master-selection.qza file to select specific context sequences from the filtered-context-seqs.qza file that was generated earlier.
qiime feature-table filter-seqs \
--i-data filtered-context-seqs.qza \
--m-metadata-file master-selection.qza \
--o-filtered-data subsampled-context-seqs.qza
__md = master_selection.view(Metadata)
subsampled_context_seqs, = feature_table_actions.filter_seqs(
data=filtered_context_seqs,
metadata=__md,
)
__md <- master_selection$view(Metadata)
action_results <- feature_table_actions$filter_seqs(
data=filtered_context_seqs,
metadata=__md,
)
subsampled_context_seqs <- action_results$filtered_data
subsampled_context_seqs, = use.action(
use.UsageAction(plugin_id='feature_table',
action_id='filter_seqs'),
use.UsageInputs(data=filtered_context_seqs,
metadata=use.view_as_metadata('_', master_selection)),
use.UsageOutputNames(filtered_data='subsampled_context_seqs')
)
Next, merge the resulting subsampled context sequences with the focal sequences to create the final QIIME 2 artifact.
qiime feature-table merge-seqs \
--i-data subsampled-context-seqs.qza filtered-focal-seqs.qza \
--o-merged-data sequences.qza
sequences, = feature_table_actions.merge_seqs(
data=[subsampled_context_seqs, filtered_focal_seqs],
)
action_results <- feature_table_actions$merge_seqs(
data=list(subsampled_context_seqs, filtered_focal_seqs),
)
sequences <- action_results$merged_data
sequences, = use.action(
use.UsageAction(plugin_id='feature_table',
action_id='merge_seqs'),
use.UsageInputs(data=[subsampled_context_seqs, filtered_focal_seqs]),
use.UsageOutputNames(merged_data='sequences')
)
Exporting data
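Most downstream tools expect plain fasta files rather than QIIME 2 artifacts, so before moving on you’ll typically export the merged sequences. On the command line this is done with qiime tools export (for example, qiime tools export --input-path sequences.qza --output-path exported-sequences, which writes the underlying fasta file into the exported-sequences directory). Because artifacts are structured zip files, you can also pull the fasta out from Python with the standard library, as sketched below; the exact path of the fasta file inside the archive is determined by QIIME 2, so the sketch searches for it rather than hard-coding it. If you built sequences with the Python API, save it first with sequences.save('sequences.qza').
# Sketch: extract the fasta data from sequences.qza (a structured zip file)
# for use with downstream tools.
import zipfile
with zipfile.ZipFile('sequences.qza') as zf:
    fasta_members = [n for n in zf.namelist()
                     if '/data/' in n and n.endswith('.fasta')]
    for member in fasta_members:
        with zf.open(member) as src, open('sequences.fasta', 'wb') as dst:
            dst.write(src.read())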
Optional: alignment and phylogenetic reconstruction with QIIME 2
QIIME 2 contains tools for preliminary sequence alignment and phylogenetic tree generation in the q2-alignment and q2-phylogeny plugins. QIIME 2 is not designed for high-accuracy phylogenetic analysis, but it can help you to generate and visualize initial alignments and trees.
These steps take the sequences.qza file as input, so if you’re interested in using them you could postpone or skip the export step described above.
If you’d like to learn more about this, refer to Downstream alignment and phylogenetics.
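As a minimal sketch of what that might look like with the Python API (assuming the q2-alignment and q2-phylogeny plugins are installed; see the linked section for a fuller treatment):
# Minimal sketch: align the sampled sequences with MAFFT and build a quick
# tree with FastTree using the q2-alignment and q2-phylogeny plugins.
import qiime2.plugins.alignment.actions as alignment_actions
import qiime2.plugins.phylogeny.actions as phylogeny_actions
aligned_seqs, = alignment_actions.mafft(sequences=sequences)
tree, = phylogeny_actions.fasttree(alignment=aligned_seqs)
tree.save('tree.qza')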