#------------------------------------------------------------------------------
# $File: bioinformatics,v 1.5 2019/04/19 00:42:27 christos Exp $
# bioinformatics: file(1) magic for bioinformatics file formats

###############################################################################
# BGZF (Blocked GNU Zip Format) - gzip compatible, but also indexable
# used by SAMtools bgzip/tabix (http://samtools.sourceforge.net/tabix.shtml)
###############################################################################
# Starts with the two gzip signature bytes 0x1f 0x8b.
0 string \037\213
# Flag byte at offset 3 must have bit 0x04 set (FEXTRA in RFC 1952, i.e. an
# "extra field" follows the fixed gzip header).
>3 byte &0x04
# The extra field's first subfield carries the two-character identifier "BC".
>>12 string BC
# The little-endian 16-bit word after the identifier must have bit 0x02 set.
# NOTE(review): this is a bit test, not an equality test; presumably it is the
# subfield length, which BGZF fixes at 2 - confirm against the BGZF spec.
>>>14 leshort &0x02 Blocked GNU Zip Format (BGZF; gzip compatible)
# The next 16-bit word is printed verbatim as the block length.
>>>>16 leshort x \b, block length %d
!:mime application/x-gzip

###############################################################################
# Tabix index file
# used by SAMtools bgzip/tabix (http://samtools.sourceforge.net/tabix.shtml)
###############################################################################
# All fields below are little-endian 32-bit words at fixed offsets after the
# 4-byte "TBI\1" signature.
0 string TBI\1 SAMtools TBI (Tabix index format)
# Offset 0x04: number of reference sequences (singular/plural wording).
>0x04 lelong =1 \b, with %d reference sequence
>0x04 lelong >1 \b, with %d reference sequences
# Offset 0x08: format word. Bit 0x10000 set selects BED-style coordinates.
>0x08 lelong &0x10000 \b, using half-closed-half-open coordinates (BED style)
# Bit 0x10000 clear: the whole word selects one of the presets below.
>0x08 lelong ^0x10000
>>0x08 lelong =0 \b, using closed and one based coordinates (GFF style)
>>0x08 lelong =1 \b, using SAM format
>>0x08 lelong =2 \b, using VCF format
# Offsets 0x0c and 0x10: column numbers for sequence name and region start.
>0x0c lelong x \b, sequence name column: %d
>0x10 lelong x \b, region start column: %d
# The region end column (offset 0x14) is only reported when the format word
# is exactly 0.
>0x08 lelong =0
>>0x14 lelong x \b, region end column: %d
# Offset 0x18: comment character (printed as a char); offset 0x1c: number of
# leading lines to skip.
>0x18 byte x \b, comment character: %c
>0x1c lelong x \b, skip line count: %d

###############################################################################
# BAM (Binary Sequence Alignment/Map format)
# used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
# data is normally present only within compressed BGZF blocks (CDATA), so use file -z to examine it
###############################################################################
0 string BAM\1 SAMtools BAM (Binary Sequence Alignment/Map)
# The 32-bit word after the signature must be positive.
# NOTE(review): presumably the length of the embedded SAM header text.
>0x04 lelong >0
# Relative offset (&): continue right after the word just matched and look
# for an "@HD" header line that carries a "VN:" tag.
>>&0x00 regex =^[@]HD\t.*VN: \b, with SAM header
# The digits/dots following "VN:" are printed as the version.
>>>&0 regex =[0-9.]+ \b version %s
# Relative indirect offset: the location is derived from the 32-bit value
# stored at offset 4.
# NOTE(review): presumably this skips the header text to reach the reference
# count - confirm against the indirect-offset rules in magic(5).
>>&(0x04) lelong >0 \b, with %d reference sequences

###############################################################################
# BAI (BAM indexing format)
# used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
###############################################################################
0 string BAI\1 SAMtools BAI (BAM indexing format)
# The little-endian 32-bit word after the 4-byte signature is reported as the
# number of reference sequences when it is positive.
>0x04 lelong >0 \b, with %d reference sequences

###############################################################################
# CRAM (reference-based compressed sequence alignment format)
# specification: https://samtools.github.io/hts-specs/CRAMv3.pdf
###############################################################################
0 string CRAM CRAM
# Bytes 4 and 5 are printed as "version <a>.<b>". The byte type is signed, so
# ">-1" accepts values 0..127 only.
>0x04 byte >-1 version %d.
>0x05 byte >-1 \b%d
# A non-empty string starting at offset 6 is printed as the file identifier.
>0x06 string >\0 (identified as %s)

###############################################################################
# BCF (Binary Call Format), version 1
# used by SAMtools & VCFtools (http://vcftools.sourceforge.net/bcf.pdf)
# data is normally present only within compressed BGZF blocks (CDATA), so use file -z to examine it
###############################################################################
# Each step below uses an offset relative to the end of the previous match.
# NOTE(review): the "&(&-0x04)" form presumably re-reads the length word just
# matched (4 bytes back) and skips that many bytes - confirm against magic(5).
0 string BCF\4
# length of seqnm data in bytes is positive
>&0x00 lelong >0
# length of smpl data in bytes is positive
>>&(&-0x04) lelong >0 SAMtools BCF (Binary Call Format)
# length of meta in bytes
>>>&(&-0x04) lelong >0
# have meta text string
>>>>&0x00 search ##samtoolsVersion=
# whatever follows "##samtoolsVersion=" is printed as the generator version
>>>>>&0x00 string x \b, generated by SAMtools version %s

###############################################################################
# BCF (Binary Call Format), version 2.1
# used by SAMtools (https://samtools.github.io/hts-specs/BCFv2_qref.pdf)
# data is normally present only within compressed BGZF blocks (CDATA), so use file -z to examine it
###############################################################################
# Signature is "BCF" followed by the bytes 2 and 1.
0 string BCF\2\1 Binary Call Format (BCF) version 2.1
# length of header text
>&0x00 lelong >0
# have header string
>>&0x00 search ##samtoolsVersion=
# whatever follows "##samtoolsVersion=" is printed as the generator version
>>>&0x00 string x \b, generated by SAMtools version %s

###############################################################################
# BCF (Binary Call Format), version 2.2
# used by SAMtools (https://samtools.github.io/hts-specs/BCFv2_qref.pdf)
# data is normally present only within compressed BGZF blocks (CDATA), so use file -z to examine it
###############################################################################
# Signature is "BCF" followed by the bytes 2 and 2.
0 string BCF\2\2 Binary Call Format (BCF) version 2.2
# length of header text
>&0x00 lelong >0
# have header string
>>&0x00 search ##samtoolsVersion=
# whatever follows "##samtoolsVersion=" is printed as the generator version
>>>&0x00 string x \b, generated by SAMtools version %s

###############################################################################
# VCF (Variant Call Format)
# used by VCFtools (http://vcftools.sourceforge.net/)
###############################################################################
# Text format: look for the "##fileformat=VCFv" meta line from the start of
# the file.
0 search ##fileformat=VCFv Variant Call Format (VCF)
# The text immediately after "VCFv" is printed as the version.
>&0 string x \b version %s

###############################################################################
# FASTQ
# used by MAQ (http://maq.sourceforge.net/fastq.shtml)
###############################################################################
# XXX Broken?
# @<seqname>
#0 regex =^@[A-Za-z0-9_.:-]+\?\n
# <seq>
#>&1 regex =^[A-Za-z\n.~]++
# +[<seqname>]
#>>&1 regex =^[A-Za-z0-9_.:-]*\?\n
# <qual>
#>>>&1 regex =^[!-~\n]+\n FASTQ

###############################################################################
# FASTA
# used by FASTA (https://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf)
###############################################################################
#0 byte 0x3e
#>0 regex =^[>][!-~\t\ ]+$
# Amino Acid codes: [A-IK-Z*-]+
#>>1 regex !=[!-'Jj;:=?@^`|~\\] FASTA
# IUPAC codes/gaps: [ACGTURYKMSWBDHVNX-]+
# not in IUPAC codes/gaps: [EFIJLOPQZ]
#>>>1 regex !=[EFIJLOPQZefijlopqz] \b, with IUPAC nucleotide codes
#>>>1 regex =^[EFIJLOPQZefijlopqz]+$ \b, with Amino Acid codes

###############################################################################
# SAM (Sequence Alignment/Map format)
# used by SAMtools (http://samtools.sourceforge.net/SAM1.pdf)
###############################################################################
# Short-cut version to recognise SAM files with (optional) header at beginning
###############################################################################
# File begins with an "@HD" header record followed by a tab.
0 string @HD\t
# A "VN:" tag must appear somewhere after the record type.
>4 search VN: Sequence Alignment/Map (SAM), with header
# The digits/dots right after "VN:" are printed as the version.
>>&0 regex [0-9.]+ \b version %s
###############################################################################
# Longer version to recognise SAM alignment lines using (many) regexes
###############################################################################
# Each nested level checks one more of the mandatory tab-separated columns;
# the description is only printed once the QUAL column has matched.
# SAM Alignment QNAME
0 regex =^[!-?A-~]{1,255}(\t[^\t]+){11}
# SAM Alignment FLAG
>0 regex =^([^\t]+\t){1}[0-9]{1,5}\t
# SAM Alignment RNAME
# Either a lone "*" or a name that does not start with "*" or "=". The
# alternation is grouped so both branches stay anchored to the third column;
# left ungrouped, the second branch matches almost any text containing a tab.
>>0 regex =^([^\t]+\t){2}(\\*|[^*=\t][^\t]*)\t
# SAM Alignment POS
>>>0 regex =^([^\t]+\t){3}[0-9]{1,9}\t
# SAM Alignment MAPQ
>>>>0 regex =^([^\t]+\t){4}[0-9]{1,3}\t
# SAM Alignment CIGAR
>>>>>0 regex =\t(\\*|([0-9]+[MIDNSHPX=])+)\t
# SAM Alignment RNEXT
>>>>>>0 regex =\t(\\*|=|[!-()+->?-~][!-~]*)\t
# SAM Alignment PNEXT
>>>>>>>0 regex =^([^\t]+\t){7}[0-9]{1,9}\t
# SAM Alignment TLEN
>>>>>>>>0 regex =\t[+-]{0,1}[0-9]{1,9}\t.*\t
# SAM Alignment SEQ
>>>>>>>>>0 regex =^([^\t]+\t){9}(\\*|[A-Za-z=.]+)\t
# SAM Alignment QUAL
>>>>>>>>>>0 regex =^([^\t]+\t){10}[!-~]+ Sequence Alignment/Map (SAM)
# Optional extras: an "@HD" header record with a "VN:" tag, then its version.
>>>>>>>>>>>0 regex =^[@]HD\t.*VN: \b, with header
>>>>>>>>>>>>&0 regex =[0-9.]+ \b version %s