Skip to article frontmatter | Skip to article content

Formats

genome-sampler


IDSelectionDirFmt

plugin: genome-sampler
type: directory
files:
name      path          format            required
included  included.txt  UNIXListFormat    True
excluded  excluded.txt  UNIXListFormat    True
metadata  metadata.tsv  IDMetadataFormat  True
label     label.txt     UNIXListFormat    True

GISAIDDNAFASTAFormat

type: text file

VCFMaskFormat

type: text file

VCFMaskDirFmt

plugin: genome-sampler
type: directory
files:
name  path      format         required
file  mask.tsv  VCFMaskFormat  True

metadata


ArtificialGroupingFormat

type: text file

ArtificialGroupingDirectoryFormat

plugin: metadata
type: directory
files:
name  path                      format                    required
file  artificial-groupings.tsv  ArtificialGroupingFormat  True

types


Bowtie2IndexDirFmt

plugin: types
type: directory
files:
name  path                    format                  required
idx1  .+(?<!\.rev)\.1\.bt2l?  Bowtie2IndexFileFormat  True
idx2  .+(?<!\.rev)\.2\.bt2l?  Bowtie2IndexFileFormat  True
ref3  .+\.3\.bt2l?            Bowtie2IndexFileFormat  True
ref4  .+\.4\.bt2l?            Bowtie2IndexFileFormat  True
rev1  .+\.rev\.1\.bt2l?       Bowtie2IndexFileFormat  True
rev2  .+\.rev\.2\.bt2l?       Bowtie2IndexFileFormat  True

LSMatFormat

type: text file

DistanceMatrixDirectoryFormat

plugin: types
type: directory
files:
name  path                 format       required
file  distance-matrix.tsv  LSMatFormat  True

TSVTaxonomyFormat

Format for a 2+ column TSV file with an expected minimal header.

The only header recognized by this format is:

Feature ID<tab>Taxon

Optionally followed by other arbitrary columns.

This format supports blank lines. The expected header must be the first non-blank line. In addition to the header, there must be at least one line of data.

type: text file

TSVTaxonomyDirectoryFormat

plugin: types
type: directory
files:
name  path          format             required
file  taxonomy.tsv  TSVTaxonomyFormat  True

HeaderlessTSVTaxonomyFormat

Format for a 2+ column TSV file without a header.

This format supports comment lines starting with #, and blank lines.

type: text file

HeaderlessTSVTaxonomyDirectoryFormat

plugin: types
type: directory
files:
name  path          format                       required
file  taxonomy.tsv  HeaderlessTSVTaxonomyFormat  True

TaxonomyFormat

Legacy format for any 2+ column TSV file, with or without a header.

This format has been superseded by two taxonomy file formats that are explicit about headers: TSVTaxonomyFormat (with a header) and HeaderlessTSVTaxonomyFormat (without a header).

This format remains in place for backwards-compatibility. Transformers are intentionally not hooked up to transform this format into the canonical .qza format (TSVTaxonomyFormat) to prevent users from importing data in this format. Transformers will remain in place to transform this format into in-memory Python objects (e.g. pd.Series) so that existing .qza files can still be loaded and processed.

The only header recognized by this format is:

Feature ID<tab>Taxon

Optionally followed by other arbitrary columns.

If this header isn't present, the format is assumed to be headerless.

This format supports comment lines starting with #, and blank lines.

type: text file

TaxonomyDirectoryFormat

plugin: types
type: directory
files:
name  path          format          required
file  taxonomy.tsv  TaxonomyFormat  True

FASTAFormat

type: text file

DNAFASTAFormat

type: text file

DNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                 format          required
file  dna-sequences.fasta  DNAFASTAFormat  True

PairedDNASequencesDirectoryFormat

plugin: types
type: directory
files:
name                 path                       format          required
left_dna_sequences   left-dna-sequences.fasta   DNAFASTAFormat  True
right_dna_sequences  right-dna-sequences.fasta  DNAFASTAFormat  True

AlignedDNAFASTAFormat

type: text file

AlignedDNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                         format                 required
file  aligned-dna-sequences.fasta  AlignedDNAFASTAFormat  True

DifferentialFormat

type: text file

DifferentialDirectoryFormat

plugin: types
type: directory
files:
name  path               format              required
file  differentials.tsv  DifferentialFormat  True

ProteinFASTAFormat

type: text file

AlignedProteinFASTAFormat

type: text file

MixedCaseProteinFASTAFormat

type: text file

MixedCaseAlignedProteinFASTAFormat

type: text file

ProteinSequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                     format              required
file  protein-sequences.fasta  ProteinFASTAFormat  True

AlignedProteinSequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                             format                     required
file  aligned-protein-sequences.fasta  AlignedProteinFASTAFormat  True

MixedCaseProteinSequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                     format                       required
file  protein-sequences.fasta  MixedCaseProteinFASTAFormat  True

MixedCaseAlignedProteinSequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                             format                              required
file  aligned-protein-sequences.fasta  MixedCaseAlignedProteinFASTAFormat  True

RNAFASTAFormat

type: text file

RNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                 format          required
file  rna-sequences.fasta  RNAFASTAFormat  True

AlignedRNAFASTAFormat

type: text file

AlignedRNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                         format                 required
file  aligned-rna-sequences.fasta  AlignedRNAFASTAFormat  True

PairedRNASequencesDirectoryFormat

plugin: types
type: directory
files:
name                 path                       format          required
left_rna_sequences   left-rna-sequences.fasta   RNAFASTAFormat  True
right_rna_sequences  right-rna-sequences.fasta  RNAFASTAFormat  True

BLAST6Format

type: text file

BLAST6DirectoryFormat

plugin: types
type: directory
files:
name  path        format        required
file  blast6.tsv  BLAST6Format  True

MixedCaseDNAFASTAFormat

type: text file

MixedCaseDNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                 format                   required
file  dna-sequences.fasta  MixedCaseDNAFASTAFormat  True

MixedCaseRNAFASTAFormat

type: text file

MixedCaseRNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                 format                   required
file  rna-sequences.fasta  MixedCaseRNAFASTAFormat  True

MixedCaseAlignedDNAFASTAFormat

type: text file

MixedCaseAlignedDNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                         format                          required
file  aligned-dna-sequences.fasta  MixedCaseAlignedDNAFASTAFormat  True

MixedCaseAlignedRNAFASTAFormat

type: text file

MixedCaseAlignedRNASequencesDirectoryFormat

plugin: types
type: directory
files:
name  path                         format                          required
file  aligned-rna-sequences.fasta  MixedCaseAlignedRNAFASTAFormat  True

SequenceCharacteristicsFormat

Format for a TSV file with information about sequences, such as the length of a feature. The first column contains feature identifiers and is followed by other optional columns.

The file cannot be empty and must have at least two columns.

Validation for additional columns can be added with a semantic validator tied to a property. For example, the "validate_seq_char_len" validator for "FeatureData[SequenceCharacteristics % Properties("length")]" adds validation for a numerical column called "length".

type: text file

SequenceCharacteristicsDirectoryFormat

plugin: types
type: directory
files:
name  path                          format                         required
file  sequence_characteristics.tsv  SequenceCharacteristicsFormat  True

MAGSequencesDirFmt

plugin: types
type: directory
files:
name       path                                                                                                 format          required
sequences  ^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-4[0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}\.(fa|fasta)$  DNAFASTAFormat  True

MAGtoContigsFormat

type: text file

MAGtoContigsDirFmt

plugin: types
type: directory
files:
name  path                 format              required
file  mag-to-contigs.json  MAGtoContigsFormat  True

BIOMV100Format

type: text file

BIOMV210Format

type: binary file

BIOMV100DirFmt

plugin: types
type: directory
files:
name  path                format          required
file  feature-table.biom  BIOMV100Format  True

BIOMV210DirFmt

plugin: types
type: directory
files:
name  path                format          required
file  feature-table.biom  BIOMV210Format  True

GenesDirectoryFormat

plugin: types
type: directory
files:
name   path                 format          required
genes  .+\.(fa|fna|fasta)$  DNAFASTAFormat  True

ProteinsDirectoryFormat

plugin: types
type: directory
files:
name      path                 format              required
proteins  .+\.(fa|faa|fasta)$  ProteinFASTAFormat  True

LociDirectoryFormat

plugin: types
type: directory
files:
name  path      format      required
loci  .+\.gff$  GFF3Format  True

GenomeSequencesDirectoryFormat

plugin: types
type: directory
files:
name     path             format          required
genomes  .+\.(fasta|fa)$  DNAFASTAFormat  True

OrthologFileFmt

type: text file

SeedOrthologDirFmt

plugin: types
type: directory
files:
name            path                    format           required
seed_orthologs  .*\..*\.seed_orthologs  OrthologFileFmt  True

OrthologAnnotationDirFmt

plugin: types
type: directory
files:
name         path             format           required
annotations  .+\.annotations  OrthologFileFmt  True

GFF3Format

Generic Feature Format Version 3 (GFF3) spec: gff3.md (https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md)

NCBI modifications to the above: https://www.ncbi.nlm.nih.gov/datasets/docs/reference-docs/file-formats/about-ncbi-gff3/

type: text file

KaijuDBDirectoryFormat

plugin: types
type: directory
files:
name   path             format                   required
nodes  nodes.dmp        NCBITaxonomyNodesFormat  True
names  names.dmp        NCBITaxonomyNamesFormat  True
index  kaiju_db.+\.fmi  KaijuIndexFormat         True

KaijuIndexFormat

type: binary file

Kraken2ReportFormat

type: text file

Kraken2OutputFormat

type: text file

Kraken2DBFormat

type: text file

Kraken2DBReportFormat

type: text file

Kraken2ReportDirectoryFormat

plugin: types
type: directory
files:
name     path                  format               required
reports  .+report\.(txt|tsv)$  Kraken2ReportFormat  True

Kraken2OutputDirectoryFormat

plugin: types
type: directory
files:
name     path                  format               required
reports  .+output\.(txt|tsv)$  Kraken2OutputFormat  True

Kraken2DBDirectoryFormat

plugin: types
type: directory
files:
name  path      format           required
hash  hash.k2d  Kraken2DBFormat  True
opts  opts.k2d  Kraken2DBFormat  True
taxo  taxo.k2d  Kraken2DBFormat  True

Kraken2DBReportDirectoryFormat

plugin: types
type: directory
files:
name  path        format                 required
file  report.txt  Kraken2DBReportFormat  True

BrackenDBFormat

type: text file

BrackenDBDirectoryFormat

plugin: types
type: directory
files:
name   path                                 format           required
kmers  database(\d{2,})mers\.kmer_distrib$  BrackenDBFormat  True

ImmutableMetadataFormat

type: text file

ImmutableMetadataDirectoryFormat

plugin: types
type: directory
files:
name  path          format                   required
file  metadata.tsv  ImmutableMetadataFormat  True

MultiplexedSingleEndBarcodeInSequenceDirFmt

plugin: types
type: directory
files:
name  path              format         required
file  forward.fastq.gz  FastqGzFormat  True

MultiplexedPairedEndBarcodeInSequenceDirFmt

plugin: types
type: directory
files:
name               path              format         required
forward_sequences  forward.fastq.gz  FastqGzFormat  True
reverse_sequences  reverse.fastq.gz  FastqGzFormat  True

MultiplexedFastaQualDirFmt

plugin: types
type: directory
files:
name       path         format          required
sequences  reads.fasta  DNAFASTAFormat  True
quality    reads.qual   QualFormat      True

EMPMultiplexedDirFmt

plugin: types
type: directory
files:
name       path                format         required
sequences  sequences.fastq.gz  FastqGzFormat  True
barcodes   barcodes.fastq.gz   FastqGzFormat  True

ErrorCorrectionDetailsDirFmt

plugin: types
type: directory
files:
name  path         format                     required
file  details.tsv  ErrorCorrectionDetailsFmt  True

ErrorCorrectionDetailsFmt

type: text file

EMPSingleEndDirFmt

plugin: types
type: directory
files:
name       path                format         required
sequences  sequences.fastq.gz  FastqGzFormat  True
barcodes   barcodes.fastq.gz   FastqGzFormat  True

EMPSingleEndCasavaDirFmt

plugin: types
type: directory
files:
name       path                                  format         required
sequences  Undetermined_S0_L001_R1_001.fastq.gz  FastqGzFormat  True
barcodes   Undetermined_S0_L001_I1_001.fastq.gz  FastqGzFormat  True

EMPPairedEndDirFmt

plugin: types
type: directory
files:
name      path               format         required
forward   forward.fastq.gz   FastqGzFormat  True
reverse   reverse.fastq.gz   FastqGzFormat  True
barcodes  barcodes.fastq.gz  FastqGzFormat  True

EMPPairedEndCasavaDirFmt

plugin: types
type: directory
files:
name      path                                  format         required
forward   Undetermined_S0_L001_R1_001.fastq.gz  FastqGzFormat  True
reverse   Undetermined_S0_L001_R2_001.fastq.gz  FastqGzFormat  True
barcodes  Undetermined_S0_L001_I1_001.fastq.gz  FastqGzFormat  True

OrdinationFormat

type: text file

OrdinationDirectoryFormat

plugin: types
type: directory
files:
name  path            format            required
file  ordination.txt  OrdinationFormat  True

ProcrustesStatisticsFmt

type: text file

ProcrustesStatisticsDirFmt

plugin: types
type: directory
files:
name  path                      format                   required
file  ProcrustesStatistics.tsv  ProcrustesStatisticsFmt  True

FastqManifestFormat

Mapping of sample identifiers to relative filepaths and read direction.

type: text file

FastqAbsolutePathManifestFormat

Mapping of sample identifiers to absolute filepaths and read direction.

type: text file

YamlFormat

Arbitrary yaml-formatted file.

type: text file

FastqGzFormat

A gzipped fastq file.

type: binary file

CasavaOneEightSingleLanePerSampleDirFmt

plugin: types
type: directory
files:
name       path                                         format         required
sequences  .+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz  FastqGzFormat  True

CasavaOneEightLanelessPerSampleDirFmt

plugin: types
type: directory
files:
name       path                        format         required
sequences  .+_.+_R[12]_001\.fastq\.gz  FastqGzFormat  True

SingleLanePerSampleSingleEndFastqDirFmt

plugin: types
type: directory
files:
name       path                                         format               required
sequences  .+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz  FastqGzFormat        True
manifest   MANIFEST                                     FastqManifestFormat  True
metadata   metadata.yml                                 YamlFormat           True

SingleLanePerSamplePairedEndFastqDirFmt

plugin: types
type: directory
files:
name       path                                         format               required
sequences  .+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz  FastqGzFormat        True
manifest   MANIFEST                                     FastqManifestFormat  True
metadata   metadata.yml                                 YamlFormat           True

SingleEndFastqManifestPhred33

type: text file

SingleEndFastqManifestPhred64

type: text file

PairedEndFastqManifestPhred33

type: text file

PairedEndFastqManifestPhred64

type: text file

SingleEndFastqManifestPhred33V2

type: text file

SingleEndFastqManifestPhred64V2

type: text file

PairedEndFastqManifestPhred33V2

type: text file

PairedEndFastqManifestPhred64V2

type: text file

QIIME1DemuxFormat

QIIME 1 demultiplexed FASTA format.

The QIIME 1 demultiplexed FASTA format is the default output format of split_libraries.py and split_libraries_fastq.py. The file output by QIIME 1 is named seqs.fna; this filename is sometimes associated with the file format itself due to its widespread usage in QIIME 1.

The format is documented here: http://qiime.org/documentation/file_formats.html#demultiplexed-sequences

Format details:

- FASTA file with exactly two lines per record: header and sequence. Each sequence must span exactly one line and cannot be split across multiple lines.

- The ID in each header must follow the format <sample-id>_<seq-id>. <sample-id> is the identifier of the sample the sequence belongs to, and <seq-id> is an identifier for the sequence within its sample. In QIIME 1, <seq-id> is typically an incrementing integer starting from zero, but any non-empty value can be used here, as long as the header IDs remain unique throughout the file. Note: <sample-id> may itself contain underscores; the rightmost underscore will be used to delimit sample and sequence IDs.

- Descriptions in headers are permitted and ignored.

- Header IDs must be unique within the file.

- Each sequence must be DNA and cannot be empty.

type: text file

QIIME1DemuxDirFmt

plugin: types
type: directory
files:
name  path      format             required
file  seqs.fna  QIIME1DemuxFormat  True

SampleIdIndexedSingleEndPerSampleDirFmt

Single-end reads in fastq.gz files where the base filename is the sample id.

The full file name, minus the extension (.fastq.gz), is the sample id. For example, the sample id for the file:

- sample-1.fastq.gz is sample-1
- xyz.fastq.gz is xyz
- sample-42_S1_L001_R1_001.fastq.gz is sample-42_S1_L001_R1_001

plugin: types
type: directory
files:
name       path           format         required
sequences  .+\.fastq\.gz  FastqGzFormat  True

MultiFASTADirectoryFormat

plugin: types
type: directory
files:
name       path             format          required
sequences  .+\.(fa|fasta)$  DNAFASTAFormat  True

MultiMAGSequencesDirFmt

plugin: types
type: directory
files:
name       path             format                  required
sequences  .+\.(fa|fasta)$  DNAFASTAFormat          True
manifest   MANIFEST         MultiMAGManifestFormat  True

ContigSequencesDirFmt

plugin: types
type: directory
files:
name       path                         format          required
sequences  [^\.].+_contigs.(fasta|fa)$  DNAFASTAFormat  True

MultiBowtie2IndexDirFmt

plugin: types
type: directory
files:
name  path                    format                  required
idx1  .+(?<!\.rev)\.1\.bt2l?  Bowtie2IndexFileFormat  True
idx2  .+(?<!\.rev)\.2\.bt2l?  Bowtie2IndexFileFormat  True
ref3  .+\.3\.bt2l?            Bowtie2IndexFileFormat  True
ref4  .+\.4\.bt2l?            Bowtie2IndexFileFormat  True
rev1  .+\.rev\.1\.bt2l?       Bowtie2IndexFileFormat  True
rev2  .+\.rev\.2\.bt2l?       Bowtie2IndexFileFormat  True

BAMFormat

type: binary file

BAMDirFmt

plugin: types
type: directory
files:
name  path     format     required
bams  .+\.bam  BAMFormat  True

MultiBAMDirFmt

plugin: types
type: directory
files:
name  path         format     required
bams  .+\/.+\.bam  BAMFormat  True

MultiMAGManifestFormat

type: text file

ProteinMultipleProfileHmmFileFmt

type: text file

ProteinSingleProfileHmmFileFmt

type: text file

RNAMultipleProfileHmmFileFmt

type: text file

RNASingleProfileHmmFileFmt

type: text file

DNAMultipleProfileHmmFileFmt

type: text file

DNASingleProfileHmmFileFmt

type: text file

PressedProfileHmmsDirectoryFmt

The <hmmfile>.h3m file contains the profile HMMs and their annotation in a binary format. The <hmmfile>.h3i file is an SSI index for the <hmmfile>.h3m file. The <hmmfile>.h3f file contains precomputed data structures for the fast heuristic filter (the MSV filter). The <hmmfile>.h3p file contains precomputed data structures for the rest of each profile.

plugin: types
type: directory
files:
name  path          format                   required
h3m   .*\.hmm\.h3m  ProfileHmmBinaryFileFmt  True
h3i   .*\.hmm\.h3i  ProfileHmmBinaryFileFmt  True
h3f   .*\.hmm\.h3f  ProfileHmmBinaryFileFmt  True
h3p   .*\.hmm\.h3p  ProfileHmmBinaryFileFmt  True

ProteinSingleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name     path     format                          required
profile  .*\.hmm  ProteinSingleProfileHmmFileFmt  True

ProteinMultipleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name      path     format                            required
profiles  .*\.hmm  ProteinMultipleProfileHmmFileFmt  True

DNASingleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name     path     format                      required
profile  .*\.hmm  DNASingleProfileHmmFileFmt  True

DNAMultipleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name      path     format                        required
profiles  .*\.hmm  DNAMultipleProfileHmmFileFmt  True

RNASingleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name     path     format                      required
profile  .*\.hmm  RNASingleProfileHmmFileFmt  True

RNAMultipleProfileHmmDirectoryFmt

plugin: types
type: directory
files:
name      path     format                        required
profiles  .*\.hmm  RNAMultipleProfileHmmFileFmt  True

EggnogRefTextFileFmt

type: text file

EggnogRefBinFileFmt

type: binary file

EggnogRefDirFmt

plugin: types
type: directory
files:
name    path          format               required
eggnog  eggnog.*db.*  EggnogRefBinFileFmt  True

DiamondDatabaseFileFmt

type: binary file

DiamondDatabaseDirFmt

plugin: types
type: directory
files:
name  path         format                  required
file  ref_db.dmnd  DiamondDatabaseFileFmt  True

NCBITaxonomyNodesFormat

type: text file

NCBITaxonomyNamesFormat

type: text file

NCBITaxonomyBinaryFileFmt

type: binary file

NCBITaxonomyDirFmt

plugin: types
type: directory
files:
name     path                     format                     required
node     nodes.dmp                NCBITaxonomyNodesFormat    True
names    names.dmp                NCBITaxonomyNamesFormat    True
tax_map  prot.accession2taxid.gz  NCBITaxonomyBinaryFileFmt  True

EggnogProteinSequencesDirFmt

plugin: types
type: directory
files:
name        path               format                       required
taxid_info  e5.taxid_info.tsv  EggnogRefTextFileFmt         True
proteins    e5.proteomes.faa   MixedCaseProteinFASTAFormat  True

AlphaDiversityFormat

type: text file

AlphaDiversityDirectoryFormat

plugin: types
type: directory
files:
name  path                 format                required
file  alpha-diversity.tsv  AlphaDiversityFormat  True

NewickFormat

type: text file

NewickDirectoryFormat

plugin: types
type: directory
files:
name  path      format        required
file  tree.nwk  NewickFormat  True