209 lines
9.5 KiB
Text
209 lines
9.5 KiB
Text
|
|
||
|
#------------------------------------------------------------------------------
|
||
|
# $File: ispell,v 1.9 2023/07/30 16:02:43 christos Exp $
|
||
|
# ispell: file(1) magic for ispell, MySpell and Hunspell
|
||
|
#
|
||
|
# Ispell 3.0 has a magic of 0x9601 and ispell 3.1 has 0x9602. This magic
|
||
|
# will match 0x9600 through 0x9603 in *both* little endian and big endian.
|
||
|
# (No other current magic entries collide.)
|
||
|
#
|
||
|
# Updated by Daniel Quinlan (quinlan@yggdrasil.com)
|
||
|
#
|
||
|
# Little-endian ispell hash file: the 16-bit word at offset 0, masked with
# 0xFFFC, must equal 0x9600, so the values 0x9600 through 0x9603 are accepted.
0 leshort&0xFFFC 0x9600 little endian ispell
|
||
|
# Offset 0 holds the low byte of the little-endian magic word; its value (0-3)
# selects the version text printed below.
>0 byte 0 hash file (?),
|
||
|
>0 byte 1 3.0 hash file,
|
||
|
>0 byte 2 3.1 hash file,
|
||
|
>0 byte 3 hash file (?),
|
||
|
# The 16-bit word at offset 2 is decoded by the table below. As the messages
# show: bit 0 set = 7-bit (clear = 8-bit), bit 1 set = capitalization,
# bits 2-3 select the flag count (0x0 = 26, 0x4 = 52, 0x8 = 128, 0xC = 256).
>2 leshort 0x00 8-bit, no capitalization, 26 flags
|
||
|
>2 leshort 0x01 7-bit, no capitalization, 26 flags
|
||
|
>2 leshort 0x02 8-bit, capitalization, 26 flags
|
||
|
>2 leshort 0x03 7-bit, capitalization, 26 flags
|
||
|
>2 leshort 0x04 8-bit, no capitalization, 52 flags
|
||
|
>2 leshort 0x05 7-bit, no capitalization, 52 flags
|
||
|
>2 leshort 0x06 8-bit, capitalization, 52 flags
|
||
|
>2 leshort 0x07 7-bit, capitalization, 52 flags
|
||
|
>2 leshort 0x08 8-bit, no capitalization, 128 flags
|
||
|
>2 leshort 0x09 7-bit, no capitalization, 128 flags
|
||
|
>2 leshort 0x0A 8-bit, capitalization, 128 flags
|
||
|
>2 leshort 0x0B 7-bit, capitalization, 128 flags
|
||
|
>2 leshort 0x0C 8-bit, no capitalization, 256 flags
|
||
|
>2 leshort 0x0D 7-bit, no capitalization, 256 flags
|
||
|
>2 leshort 0x0E 8-bit, capitalization, 256 flags
|
||
|
>2 leshort 0x0F 7-bit, capitalization, 256 flags
|
||
|
# Print the 16-bit word at offset 4 only when it is greater than zero.
>4 leshort >0 and %d string characters
|
||
|
# Big-endian ispell hash file: the 16-bit word at offset 0, masked with
# 0xFFFC, must equal 0x9600, so the values 0x9600 through 0x9603 are accepted.
0 beshort&0xFFFC 0x9600 big endian ispell
|
||
|
# Offset 1 holds the low byte of the big-endian magic word; its value (0-3)
# selects the version text printed below.
>1 byte 0 hash file (?),
|
||
|
>1 byte 1 3.0 hash file,
|
||
|
>1 byte 2 3.1 hash file,
|
||
|
>1 byte 3 hash file (?),
|
||
|
# The 16-bit word at offset 2 is decoded by the table below. As the messages
# show: bit 0 set = 7-bit (clear = 8-bit), bit 1 set = capitalization,
# bits 2-3 select the flag count (0x0 = 26, 0x4 = 52, 0x8 = 128, 0xC = 256).
>2 beshort 0x00 8-bit, no capitalization, 26 flags
|
||
|
>2 beshort 0x01 7-bit, no capitalization, 26 flags
|
||
|
>2 beshort 0x02 8-bit, capitalization, 26 flags
|
||
|
>2 beshort 0x03 7-bit, capitalization, 26 flags
|
||
|
>2 beshort 0x04 8-bit, no capitalization, 52 flags
|
||
|
>2 beshort 0x05 7-bit, no capitalization, 52 flags
|
||
|
>2 beshort 0x06 8-bit, capitalization, 52 flags
|
||
|
>2 beshort 0x07 7-bit, capitalization, 52 flags
|
||
|
>2 beshort 0x08 8-bit, no capitalization, 128 flags
|
||
|
>2 beshort 0x09 7-bit, no capitalization, 128 flags
|
||
|
>2 beshort 0x0A 8-bit, capitalization, 128 flags
|
||
|
>2 beshort 0x0B 7-bit, capitalization, 128 flags
|
||
|
>2 beshort 0x0C 8-bit, no capitalization, 256 flags
|
||
|
>2 beshort 0x0D 7-bit, no capitalization, 256 flags
|
||
|
>2 beshort 0x0E 8-bit, capitalization, 256 flags
|
||
|
>2 beshort 0x0F 7-bit, capitalization, 256 flags
|
||
|
# Print the 16-bit word at offset 4 only when it is greater than zero.
>4 beshort >0 and %d string characters
|
||
|
# ispell 4.0 hash files kromJx <kromJx@crosswinds.net>
|
||
|
# Ispell 4.0
|
||
|
# Matches files beginning with the four ASCII bytes "ISPL", then prints the
# five 32-bit values found at offsets 4, 8, 12, 16 and 20 (always printed,
# since each test is "x").
# NOTE(review): "long" carries no explicit byte order here (no le/be prefix);
# confirm against magic(5) that this is the intended interpretation.
0 string ISPL ispell
|
||
|
>4 long x hash file version %d,
|
||
|
>8 long x lexletters %d,
|
||
|
>12 long x lexsize %d,
|
||
|
>16 long x hashsize %d,
|
||
|
>20 long x stblsize %d
|
||
|
|
||
|
# Summary: affixes definition text files for Ispell/MySpell/Hunspell
|
||
|
# From: Joerg Jenderek
|
||
|
# URL: https://www.openoffice.org/lingucomponent/affix.readme
|
||
|
# https://man.archlinux.org/man/hunspell.5.en
|
||
|
# Reference: http://mark0.net/download/triddefs_xml.7z/defs/a/affix.trid.xml
|
||
|
# Note: called "Affix file" by TrID
|
||
|
# variant starting with comment character
|
||
|
# Entry for text whose first byte is '#' (0x23). It tries three checks in
# turn, each falling back to the next through a "default" test:
#   1. a "SET " command followed by an accepted character set name,
#   2. an "SFX " command near the start of the file,
#   3. ispell keywords ("defstringtype", then a "suffixes" line).
# Each successful path hands over to the named entry spell-aff.
0 ubyte 0x23
|
||
|
# look for SET character command followed by whitespace (seems to be often 1 space character) like in:
|
||
|
# /usr/share/calibre/dictionaries/en-GB/en-GB.aff
|
||
|
>0 search/60459 SET\040
|
||
|
# skip scripts like /bin/affixcompress /bin/setupcon /bin/imdbpy2sql.py by checking for valid character SET argument
|
||
|
# character SET argument like: UTF-8
|
||
|
# The "&0" offsets below are relative: each string test starts right where the
# "SET " search match ended.
>>&0 string UTF-8
|
||
|
>>>0 use spell-aff
|
||
|
# character SET argument like: ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15
|
||
|
>>&0 string ISO8859-
|
||
|
>>>0 use spell-aff
|
||
|
# character SET argument for Russian with Cyrillic alphabet like: KOI8-R KOI8-U
|
||
|
# no russian support until war against ukraine
|
||
|
# The "use spell-aff" line under this test is commented out, so this test
# prints nothing by itself.
>>&0 string KOI8-
|
||
|
#>>>0 use spell-aff
|
||
|
# character SET argument for languages with Cyrillic alphabet like: cp1251
|
||
|
# no cyrillic support until russia war against ukraine
|
||
|
# The "use spell-aff" line under this test is commented out, so this test
# prints nothing by itself.
>>&0 string cp1251
|
||
|
#>>>0 use spell-aff
|
||
|
# character SET argument for Indian Script Code for Information Interchange (ISCII) like: ISCII-DEVANAGARI
|
||
|
>>&0 string ISCII-
|
||
|
# no example found
|
||
|
>>>0 use spell-aff
|
||
|
# not "real" affix rule files but found as unit tests inside thunderbird sources like:
|
||
|
# 1463589.aff 1695964.aff 2970240.aff
|
||
|
# Fallback for this level: taken when the "SET " search above did not match.
>0 default x
|
||
|
# look for suffix SFX command followed by whitespace like in:
|
||
|
# 1695964.aff
|
||
|
>>0 search/164 SFX\040
|
||
|
>>>0 use spell-aff
|
||
|
# if not real Hunspell/MySpell affix look for ispell variant
|
||
|
>>0 default x
|
||
|
# URL: https://manpages.debian.org/testing/ispell/ispell.5.en.html
|
||
|
# look for ispell declaration like in: /usr/lib/ispell/espanol.aff
|
||
|
>>>0 search/8251 defstringtype
|
||
|
# defstringtype declaration start with unique name (like "list" "lat" "utf8" "iso" "nroff" often like formatter name)
|
||
|
# followed by formatter name (like "nroff" "tex")
|
||
|
# followed by suffix list (like ".mm" ".ms" ".me" ".man" ".NeXT" ".txt" ".list")
|
||
|
#>>>>&1 string x DECLARATION=%s
|
||
|
>>>>0 use spell-aff
|
||
|
# ispell variant without declaration like in: /usr/lib/ispell/bulgarian.aff /usr/lib/ispell/russian.aff
|
||
|
>>>0 default x
|
||
|
# skip /etc/nilfs_cleanerd.conf by looking for ispell suffix section
|
||
|
# The pattern ends in "\n", so "suffixes" must be directly followed by a newline.
>>>>0 search/3233 suffixes\n
|
||
|
>>>>>0 use spell-aff
|
||
|
# variant starting with empty line and comment character at the beginning of 2nd line like in: /usr/lib/ispell/polish.aff
|
||
|
# The big-endian 16-bit value 0x0a23 is a newline byte (0x0a) followed by
# '#' (0x23): an empty first line, then a comment line.
0 ubeshort 0x0a23
|
||
|
# skip /etc/discover-modprobe.conf by looking for ispell declaration
|
||
|
# The search starts at offset 2, just after the two bytes tested above.
>2 search/3118 defstringtype
|
||
|
>>0 use spell-aff
|
||
|
# starting with UTF-8 Byte Order Mark (BOM) https://en.wikipedia.org/wiki/Byte_order_mark
|
||
|
# Entry for text beginning with the three BOM bytes EF BB BF.
0 string \xEF\xBB\xBF
|
||
|
# starting with UTF-8 Byte Order Mark (BOM) followed by comment starting character
|
||
|
# NOTE(review): this test has no message and no deeper-level children, so it
# prints nothing; the SET search below is at the same level (">"), so it runs
# whether or not byte 3 is '#'. Confirm whether it was meant to gate that search.
>3 string \x23
|
||
|
# starting with UTF-8 BOM and with SET character command followed by whitespace
|
||
|
# like in: /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/lt.aff
|
||
|
# look for character SET command used in MySpell and Hunspell
|
||
|
# The search starts at offset 3, directly after the BOM.
>3 search/9883 SET\040
|
||
|
>>0 use spell-aff
|
||
|
# look for FLAG type command used in MySpell and Hunspell
|
||
|
# Entry for text beginning with "FLAG"; it only hands over to spell-aff when
# the byte at offset 4 is a space (0x20) or a tab (0x09).
0 string FLAG
|
||
|
# followed by space character like in
|
||
|
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/en_US.aff
|
||
|
>4 ubyte 0x20
|
||
|
>>0 use spell-aff
|
||
|
# or followed by tabulator character like in
|
||
|
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
|
||
|
>4 ubyte 0x09
|
||
|
>>0 use spell-aff
|
||
|
# starting with character SET command used in MySpell and Hunspell like in: org/languagetool/resource/sv/hunspell/sv_SE.aff
|
||
|
# Entry for text beginning with "SET" followed by a space (\040 is octal for
# 0x20); no further check is made before handing over to spell-aff.
0 string SET\040
|
||
|
>0 use spell-aff
|
||
|
# starting with language code LANG used in MySpell and Hunspell like in: /usr/share/hunspell/tr_TR.aff
|
||
|
# Entry for text beginning with "LANG" followed by a space (\040 is octal for
# 0x20); no further check is made before handing over to spell-aff.
0 string LANG\040
|
||
|
>0 use spell-aff
|
||
|
# starting with affix flag command AF used in MySpell and Hunspell like in: /usr/lib/thunderbird/extensions/langpack-hu@thunderbird.mozilla.org/dictionaries/hu.aff
|
||
|
# Entry for text beginning with "AF" followed by a space (\040 is octal for
# 0x20); it additionally requires a run of 1 to 4 decimal digits, looked for
# from offset 3 (right after "AF "), before handing over to spell-aff.
0 string AF\040
|
||
|
# look for number of flag vector aliases
|
||
|
>3 regex [0-9]{1,4}
|
||
|
>>0 use spell-aff
|
||
|
# display information (encoding,language,...) about affixes rules text for Ispell/MySpell/Hunspell
|
||
|
# Named entry referenced by the "use spell-aff" lines of the detection entries.
# It prints "affix definition", sets the MIME type and extension, then reports
# the flavour (Ispell or MySpell/Hunspell), the LANG argument, the SET argument
# and the first text lines of the file.
0 name spell-aff
|
||
|
# Always true ("x"); used only to print the base description.
>1 ubeshort x affix definition
|
||
|
#!:mime text/plain
|
||
|
!:mime text/x-affix
|
||
|
!:ext aff
|
||
|
# GRR: need extra test so that default clause works
|
||
|
>0 ubyte x
|
||
|
# look for ispell declaration
|
||
|
>>0 search/8251 defstringtype for Ispell
|
||
|
# ispell variant without declaration
|
||
|
>>0 default x
|
||
|
# look for ispell suffixes command
|
||
|
>>>0 search/3233 suffixes
|
||
|
# skip "suffixes used to create first part of a compound" by checking for flag argument like in: languagetool\resource\sv\hunspell\sv_SE.aff
|
||
|
# "&0" is relative to the end of the "suffixes" match; search/2 limits how far
# from there the word "flag" may begin.
>>>>&0 search/2 flag for Ispell
|
||
|
>>>>&0 default x for MySpell/Hunspell
|
||
|
# without suffixes keyword
|
||
|
>>>0 default x for MySpell/Hunspell
|
||
|
# look for language code command used in MySpell and Hunspell
|
||
|
# like in: /usr/share/hunspell/de_AT.aff /usr/share/hunspell/it_IT.aff /usr/share/hunspell/tr_TR.aff /usr/lib/firefox/browser/extensions/langpack-hu@firefox.mozilla.org/dictionaries/hu.aff
|
||
|
>>0 search/1117643 LANG\040 \b, language
|
||
|
# language code argument like: de_DE hu_HU it_IT mn_MN tr_TR
|
||
|
# Prints the text that starts right after the "LANG " match.
>>>&0 string x %s
|
||
|
# look for character SET command used in MySpell and Hunspell
|
||
|
>>0 search/1117729 SET
|
||
|
# skip SETTINGS like in /usr/lib/ispell/ngerman.aff
|
||
|
# SET command followed often by space character (0x20) or tabulator (0x09) like in
|
||
|
# /opt/Wolfram/WolframEngine/13.1/SystemFiles/Components/SpellingData/SpellingDictionaries/ar.aff
|
||
|
# Neither 0x20 nor 0x09 shares a bit with the mask 0xD6, so both give 0x00
|
||
# here; a letter such as the 'T' (0x54) of SETTINGS does not.
>>>&0 ubyte&0xD6 =0x00
|
||
|
# skip SSET # schosS in /usr/lib/ispell/ogerman.aff
|
||
|
# 0x48 is ASCII 'H'; the character set names listed below all begin with a
# greater byte ('I', 'K', 'U', 'c'), whereas '#' (0x23) is smaller.
>>>>&0 ubyte >0x48 \b,
|
||
|
# character SET argument like: cp1251 ISCII-DEVANAGARI ISO8859-1 - ISO8859-10 ISO8859-13 - ISO8859-15 KOI8-R KOI8-U UTF-8
|
||
|
# "&-1" presumably steps back onto the byte just tested, so that the printed
# string includes the first character of the argument - TODO confirm.
>>>>>&-1 string x "%s" encoded
|
||
|
# for control reasons show first non empty lines for ASCII or ISO-8859 text variant
|
||
|
# Bytes 1-2 are not BB BF, i.e. the file does not begin with the BOM EF BB BF.
>1 ubeshort !0xBBBF
|
||
|
# 1st line starting with 0x0A like in /usr/src/dicts/sjp-ispell-pl-20140213/polish.aff
|
||
|
>>0 ubyte =0x0A
|
||
|
>>>1 ubyte !0x0A \b, 2nd line
|
||
|
>>>>&-1 string x "%s"
|
||
|
# 3rd line starting with 0x0A like in polish.aff
|
||
|
>>>>>&1 ubyte =0x0A
|
||
|
>>>>>>&0 string x \b, 4th line "%s"
|
||
|
# 1st line starting with ASCII text like:
|
||
|
# this is the affix file of the de_DE Hunspell dictionary
|
||
|
>>0 ubyte !0x0A
|
||
|
>>>0 string x \b, 1st line "%s"
|
||
|
# A byte greater than 0x1F is not an ASCII control character, so the next
# line is treated as non-empty text.
>>>>&1 ubyte >0x1F \b, 2nd line
|
||
|
>>>>>&-1 string x "%s"
|
||
|
# 2nd line starting with 0x0A like in /usr/lib/ispell/bulgarian.aff
|
||
|
>>>>&1 ubyte =0x0A \b, 3rd line
|
||
|
>>>>>&0 string x "%s"
|
||
|
# for control reasons show first lines for variant starting with ByteOrderMark (BOM=\xEF\xBB\xBF)
|
||
|
>1 ubeshort =0xBBBF \b, with BOM
|
||
|
# Offset 3 is the first byte after the three-byte BOM.
>>3 string x \b, 1st line "%s"
|
||
|
>>>&1 ubyte >0x1F \b, 2nd line
|
||
|
>>>>&-1 string x "%s"
|