uboot: (firmwareOdroidC2/C4) don't invoke patch tool, use patches = [] instead

https://github.com/NixOS/nixpkgs/blob/master/pkgs/stdenv/generic/setup.sh#L948
The generic patchPhase there applies everything listed in `patches`
automatically, so invoking the patch tool by hand is unnecessary.

Signed-off-by: Anton Arapov <anton@deadbeef.mx>
This commit is contained in:
Anton Arapov 2021-04-03 12:58:10 +02:00 committed by Alan Daniels
commit 56de2bcd43
30691 changed files with 3076956 additions and 0 deletions

View file

@ -0,0 +1,11 @@
--- Makefile.in~ 2011-03-06 18:52:54.000000000 +0100
+++ Makefile.in 2014-01-29 19:04:51.384844897 +0100
@@ -123,7 +123,7 @@
%: %.o
$(LIBTOOL) --tag=CC --mode=link $(CC) -o $@ -static \
- $^ $(OBJS) $(LDFLAGS) -lz ${LIBS}
+ $(^:.o=.lo) $(OBJS) $(LDFLAGS) -lz ${LIBS}
include $(srcdir)/deps

View file

@ -0,0 +1,37 @@
{ lib, stdenv, fetchurl, which, bison, flex, libmaa, zlib, libtool }:

stdenv.mkDerivation rec {
  pname = "dictd";
  version = "1.13.1";

  src = fetchurl {
    url = "mirror://sourceforge/dict/dictd-${version}.tar.gz";
    sha256 = "sha256-5PGmfRaJTYSUVp19yUQsFcw4wBHyuWMcfxzGInZlKhs=";
  };

  # Let the generic patch phase apply the fix instead of overriding
  # patchPhase with a manual `patch` invocation; this keeps the
  # prePatch/postPatch hooks working.  The diff uses bare file names,
  # so it needs -p0 rather than the default -p1.
  patches = [ ./buildfix.diff ];
  patchFlags = [ "-p0" ];

  buildInputs = [ libmaa zlib ];
  nativeBuildInputs = [ bison flex libtool which ];

  # In earlier versions, parallel building was not supported but it's OK with 1.13
  enableParallelBuilding = true;

  configureFlags = [
    "--datadir=/run/current-system/sw/share/dictd"
    "--sysconfdir=/etc"
  ];

  postInstall = ''
    install -Dm444 -t $out/share/doc/${pname} NEWS README
  '';

  meta = with lib; {
    description = "Dict protocol server and client";
    homepage = "http://www.dict.org";
    license = licenses.gpl2;
    maintainers = with maintainers; [ ];
    platforms = platforms.linux;
  };
}

View file

@ -0,0 +1,83 @@
{ stdenv, lib, dict }:
# Collector: given a list of dictionaries, builds one store path with a
# generated dictd.conf, a databases.names file, and per-database
# dict/index/word/suffix files ready to be served by dictd.
({ dictlist, allowList ? [ "127.0.0.1" ], denyList ? [ ] }:
/*
dictlist is a list of form
[ { filename = /path/to/files/basename;
name = "name"; } ]
basename.dict.dz and basename.index should be
dict files. Or look below for other options.
allowList is a list of IP/domain *-wildcarded strings
denyList is the same..
*/
let
# Shell-quoted paths of every dictionary, iterated over in installPhase.
link_arguments = map
(x: '' "${x.filename}" '')
dictlist;
# "name path" lines; installPhase greps these to recover each db's name.
databases = lib.concatStrings (map
(x:
"${x.name} ${x.filename}\n")
dictlist);
allow = lib.concatStrings (map (x: "allow ${x}\n") allowList);
deny = lib.concatStrings (map (x: "deny ${x}\n") denyList);
# access {} stanza of dictd.conf, built from the allow/deny lists.
accessSection = "
access {
${allow}
${deny}
}
";
# For each dictionary: resolve its basename (a store dir with
# share/dictd, a plain directory, or a file basename), link/compress
# the .dict data, link the .index, generate .word/.suffix indices with
# the database's own locale, and append a database stanza to dictd.conf.
installPhase = ''
mkdir -p $out/share/dictd
cd $out/share/dictd
echo "${databases}" >databases.names
echo "${accessSection}" > dictd.conf
for j in ${toString link_arguments}; do
name="$(egrep ' '"$j"\$ databases.names)"
name=''${name% $j}
if test -d "$j"; then
if test -d "$j"/share/dictd ; then
echo "Got store path $j"
j="$j"/share/dictd
fi
echo "Directory reference: $j"
i=$(ls "$j""/"*.index)
i="''${i%.index}";
else
i="$j";
fi
echo "Basename is $i"
locale=$(cat "$(dirname "$i")"/locale)
base="$(basename "$i")"
echo "Locale is $locale"
export LC_ALL=$locale
export LANG=$locale
if test -e "$i".dict.dz; then
ln -s "$i".dict.dz
else
cp "$i".dict .
dictzip "$base".dict
fi
ln -s "$i".index .
dictfmt_index2word --locale $locale < "$base".index > "$base".word || true
dictfmt_index2suffix --locale $locale < "$base".index > "$base".suffix || true
echo "database $name {" >> dictd.conf
echo " data $out/share/dictd/$base.dict.dz" >> dictd.conf
echo " index $out/share/dictd/$base.index" >> dictd.conf
echo " index_word $out/share/dictd/$base.word" >> dictd.conf
echo " index_suffix $out/share/dictd/$base.suffix" >> dictd.conf
echo "}" >> dictd.conf
done
'';
in
stdenv.mkDerivation {
name = "dictd-dbs";
buildInputs = [ dict ];
dontUnpack = true;
inherit installPhase;
})

View file

@ -0,0 +1,95 @@
{ lib, stdenv, fetchurl, callPackage }:
let
  # Probably a bug in some FreeDict release files, but easier to trivially
  # work around than report. Not that it can cause any other problems..
  makeDictdDBFreedict = src: name: locale:
    makeDictdDB src name "{.,bin}" locale;

  # Wrap a tarball of *.dict*/*.index files (found under _subdir inside the
  # unpacked tree) into a dictd-compatible store path under share/dictd,
  # recording _locale in a `locale` file for index generation.
  makeDictdDB = src: _name: _subdir: _locale:
    stdenv.mkDerivation {
      name = "dictd-db-${_name}";
      inherit src;
      locale = _locale;
      dbName = _name;
      dontBuild = true;
      unpackPhase = ''
        tar xf ${src}
      '';
      installPhase = ''
        mkdir -p $out/share/dictd
        cp $(ls ./${_subdir}/*.{dict*,index} || true) $out/share/dictd
        echo "${_locale}" >$out/share/dictd/locale
      '';
      meta = {
        description = "dictd-db dictionary for dictd";
        platforms = lib.platforms.linux;
      };
    };

  # The Mueller package ships several sub-databases inside one store path.
  # Build the attrset describing one of them: pathName is the basename
  # under share/dictd, dbname the dictd database name.  Replaces five
  # previously copy-pasted attrsets.
  muellerSubDB = pkg: pathName: dbname: {
    outPath = "${pkg}/share/dictd/${pathName}";
    name = dbname;
    dbName = dbname;
    locale = "en_UK";
  };
in rec {
  deu2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/deu-eng.tar.gz";
    sha256 = "0dqrhv04g4f5s84nbgisgcfwk5x0rpincif0yfhfh4sc1bsvzsrb";
  }) "deu-eng" "de_DE";
  eng2deu = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-deu.tar.gz";
    sha256 = "01x12p72sa3071iff3jhzga8588440f07zr56r3x98bspvdlz73r";
  }) "eng-deu" "en_EN";
  nld2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/nld-eng.tar.gz";
    sha256 = "1vhw81pphb64fzsjvpzsnnyr34ka2fxizfwilnxyjcmpn9360h07";
  }) "nld-eng" "nl_NL";
  eng2nld = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-nld.tar.gz";
    sha256 = "0rcg28ldykv0w2mpxc6g4rqmfs33q7pbvf68ssy1q9gpf6mz7vcl";
  }) "eng-nld" "en_UK";
  eng2rus = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-rus.tar.gz";
    sha256 = "15409ivhww1wsfjr05083pv6mg10bak8v5pg1wkiqybk7ck61rry";
  }) "eng-rus" "en_UK";
  fra2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/fra-eng.tar.gz";
    sha256 = "0sdd88s2zs5whiwdf3hd0s4pzzv75sdsccsrm1wxc87l3hjm85z3";
  }) "fra-eng" "fr_FR";
  eng2fra = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-fra.tar.gz";
    sha256 = "0fi6rrnbqnhc6lq8d0nmn30zdqkibrah0mxfg27hsn9z7alwbj3m";
  }) "eng-fra" "en_UK";
  mueller_eng2rus_pkg = makeDictdDB (fetchurl {
    url = "mirror://sourceforge/mueller-dict/mueller-dict-3.1.tar.gz";
    sha256 = "04r5xxznvmcb8hkxqbjgfh2gxvbdd87jnhqn5gmgvxxw53zpwfmq";
  }) "mueller-eng-rus" "mueller-dict-*/dict" "en_UK";
  # Note: the "abbrev" sub-directory deliberately maps to db name
  # "mueller-abbr" (as in the original definitions).
  mueller_enru_abbr = muellerSubDB mueller_eng2rus_pkg "mueller-abbrev" "mueller-abbr";
  mueller_enru_base = muellerSubDB mueller_eng2rus_pkg "mueller-base" "mueller-base";
  mueller_enru_dict = muellerSubDB mueller_eng2rus_pkg "mueller-dict" "mueller-dict";
  mueller_enru_geo = muellerSubDB mueller_eng2rus_pkg "mueller-geo" "mueller-geo";
  mueller_enru_names = muellerSubDB mueller_eng2rus_pkg "mueller-names" "mueller-names";
  wordnet = callPackage ./dictd-wordnet.nix {};
  wiktionary = callPackage ./wiktionary {};
}

View file

@ -0,0 +1,36 @@
{lib, stdenv, python2, wordnet, writeScript}:
# DICT-format conversion of WordNet, generated at build time from the
# wordnet package's data.*/index.* pairs by wordnet_structures.py.
stdenv.mkDerivation rec {
  version = "542";
  pname = "dict-db-wordnet";

  buildInputs = [ python2 wordnet ];
  # Converter script; exposed as an attribute so it can be referenced
  # as ${convert} below.
  convert = ./wordnet_structures.py;

  # Modernized: use the standard phased builder instead of the deprecated
  # `builder = writeScript ...; . $stdenv/setup` pattern.  There is no
  # source to unpack or compile; everything is generated in installPhase.
  dontUnpack = true;
  dontBuild = true;

  installPhase = ''
    mkdir -p $out/share/dictd/
    cd $out/share/dictd
    for i in ${wordnet}/dict/data.*; do
    DATA="$DATA `echo $i | sed -e s,data,index,` $i";
    done
    python ${convert} $DATA
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "dictd-compatible version of WordNet";
    longDescription =
      '' WordNet® is a large lexical database of English. This package makes
    the wordnet data available to dictd and by extension for lookup with
    the dict command. '';
    homepage = "https://wordnet.princeton.edu/";
    maintainers = [ ];
    platforms = lib.platforms.all;
  };
}

View file

@ -0,0 +1,22 @@
{ lib, stdenv, fetchurl, libtool }:

stdenv.mkDerivation rec {
  version = "1.3.2";
  pname = "libmaa";

  src = fetchurl {
    url = "mirror://sourceforge/dict/libmaa-${version}.tar.gz";
    sha256 = "1idi4c30pi79g5qfl7rr9s17krbjbg93bi8f2qrbsdlh78ga19ar";
  };

  buildInputs = [ libtool ];
  # configureFlags = [ "--datadir=/run/current-system/share/dictd" ];

  # Newer GCCs promote format-truncation warnings to errors on this old code.
  NIX_CFLAGS_COMPILE = "-Wno-error=format-truncation";

  meta = with lib; {
    # Fixed: the description was copy-pasted from the dictd package
    # ("Dict protocol server and client"); libmaa is the support library
    # shipped by the same dict.org project.
    description = "Low-level data structures library used by dictd";
    homepage = "http://www.dict.org";
    maintainers = [ ];
    platforms = platforms.linux;
  };
}

View file

@ -0,0 +1,35 @@
{ lib, stdenv, fetchurl, python2, dict, glibcLocales }:
# DICT-format dump of the English Wiktionary, converted at build time from
# the official XML dump by wiktionary2dict.py (next to this file).
stdenv.mkDerivation rec {
pname = "dict-db-wiktionary";
version = "20220420";
src = fetchurl {
url = "https://dumps.wikimedia.org/enwiktionary/${version}/enwiktionary-${version}-pages-articles.xml.bz2";
sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
};
# script in nixpkgs does not support python2
nativeBuildInputs = [ python2 dict glibcLocales ];
# The source is a bare .xml.bz2 dump; the converter reads it directly,
# so there is nothing to unpack.
dontUnpack = true;
installPhase = ''
mkdir -p $out/share/dictd/
cd $out/share/dictd
${python2.interpreter} -O ${./wiktionary2dict.py} "${src}"
dictzip wiktionary-en.dict
echo en_US.UTF-8 > locale
'';
# Used by `nix-shell maintainers/scripts/update.nix` to bump `version`.
passthru.updateScript = ./update.sh;
meta = with lib; {
description = "DICT version of English Wiktionary";
homepage = "https://en.wiktionary.org/";
maintainers = with maintainers; [ qyliss ];
platforms = platforms.all;
license = with licenses; [ cc-by-sa-30 fdl11Plus ];
};
}

View file

@ -0,0 +1,42 @@
import subprocess
from html.parser import HTMLParser
from os.path import abspath, dirname
from urllib.request import urlopen
class WiktionaryLatestVersionParser(HTMLParser):
    """Scrape a dump-directory listing and track the newest dump name.

    Feed it the HTML index of https://dumps.wikimedia.org/enwiktionary/;
    afterwards ``latest_version`` holds the lexicographically greatest
    directory name seen (dump names are YYYYMMDD, so greatest == newest),
    never less than the ``current_version`` it was constructed with.
    """

    def __init__(self, current_version, *args, **kwargs):
        self.latest_version = current_version
        super().__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        # Only anchors carry the directory links in the listing.
        if tag != 'a':
            return
        # hrefs look like "20220420/" -- strip the trailing slash.
        candidate = dict(attrs)['href'][:-1]
        # The "latest" symlink is not a dated dump directory; skip it.
        if candidate != 'latest':
            self.latest_version = max(self.latest_version, candidate)
def nix_prefetch_url(url, algo='sha256'):
    """Run nix-prefetch-url on `url` and return the hash (type `algo`)."""
    print(f'nix-prefetch-url {url}')
    command = ['nix-prefetch-url', '--type', algo, url]
    raw = subprocess.check_output(command)
    return raw.decode('utf-8').rstrip()
# Ask Nix for the version currently recorded in nixpkgs; the attribute
# lives four directories up from this script.
current_version = subprocess.check_output([
'nix', 'eval', '--raw',
'-f', dirname(abspath(__file__)) + '/../../../..',
'dictdDBs.wiktionary.version',
]).decode('utf-8')
# Scrape the dump index and print the newest available dump date, so the
# caller (update.sh) can pass it to update-source-version.
parser = WiktionaryLatestVersionParser(current_version)
with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
parser.feed(resp.read().decode('utf-8'))
print(parser.latest_version)

View file

@ -0,0 +1,7 @@
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p common-updater-scripts python3
# Updater for dictdDBs.wiktionary: asks latest_version.py (next to this
# script) for the newest dump date, then rewrites version + hash in-tree.
set -ueo pipefail
version="$(python "$(dirname "${BASH_SOURCE[0]}")"/latest_version.py)"
update-source-version dictdDBs.wiktionary "$version"

View file

@ -0,0 +1,778 @@
# Adapted to produce DICT-compatible files by Petr Rockai in 2012
# Based on code from wiktiondict by Greg Hewgill
import re
import sys
import codecs
import os
import textwrap
import time
import xml.sax
class Text:
    """A literal chunk of wiki text; process() yields it unchanged."""
    def __init__(self, s):
        self.s = s
    def process(self):
        # BUG FIX: the original returned the bare name `s`, which raises
        # NameError (or silently picks up an unrelated global) instead of
        # returning the stored text.
        return self.s
class TemplateCall:
    # Placeholder node for a template invocation; process() is a no-op
    # (returns None), so such nodes contribute nothing to the output.
    def __init__(self):
        pass
    def process(self):
        pass
class Template:
    """Ordered sequence of parsed template parts.

    Each appended part must expose a process() method returning a string;
    process() concatenates the processed parts in insertion order.
    """

    def __init__(self):
        self.parts = []

    def append(self, part):
        self.parts.append(part)

    def process(self):
        rendered = [part.process() for part in self.parts]
        return ''.join(rendered)
class Whitespace:
    # Token for a run of whitespace.  Only produced by the commented-out
    # branch inside Tokenise; kept for completeness.
    def __init__(self, s):
        self.s = s

# Marker tokens emitted by Tokenise for "{{", "{{{", "}}" and "}}}".
class OpenDouble: pass
class OpenTriple: pass
class CloseDouble: pass
class CloseTriple: pass

class Equals:
    # Token for "=" (separates a named argument from its value).
    def __str__(self):
        return "="

class Delimiter:
    # Token for ":" or "|" inside a template call.
    def __init__(self, c):
        self.c = c
    def __str__(self):
        return self.c
def Tokenise(s):
s = unicode(s)
stack = []
last = 0
i = 0
while i < len(s):
if s[i] == '{' and i+1 < len(s) and s[i+1] == '{':
if i > last:
yield s[last:i]
if i+2 < len(s) and s[i+2] == '{':
yield OpenTriple()
stack.append(3)
i += 3
else:
yield OpenDouble()
stack.append(2)
i += 2
last = i
elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}':
if i > last:
yield s[last:i]
if len(stack) == 0:
yield "}}"
i += 2
elif stack[-1] == 2:
yield CloseDouble()
i += 2
stack.pop()
elif i+2 < len(s) and s[i+2] == '}':
yield CloseTriple()
i += 3
stack.pop()
else:
raise SyntaxError()
last = i
elif s[i] == ':' or s[i] == '|':
if i > last:
yield s[last:i]
yield Delimiter(s[i])
i += 1
last = i
elif s[i] == '=':
if i > last:
yield s[last:i]
yield Equals()
i += 1
last = i
#elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
# if i > last:
# yield s[last:i]
# last = i
# m = re.match(r"\s+", s[i:])
# assert m
# yield Whitespace(m.group(0))
# i += len(m.group(0))
# last = i
else:
i += 1
if i > last:
yield s[last:i]
def processSub(templates, tokens, args):
t = tokens.next()
if not isinstance(t, unicode):
raise SyntaxError
name = t
t = tokens.next()
default = None
if isinstance(t, Delimiter) and t.c == '|':
default = ""
while True:
t = tokens.next()
if isinstance(t, unicode):
default += t
elif isinstance(t, OpenDouble):
default += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple):
default += processSub(templates, tokens, args)
elif isinstance(t, CloseTriple):
break
else:
print "Unexpected:", t
raise SyntaxError()
if name in args:
return args[name]
if default is not None:
return default
if name == "lang":
return "en"
return "{{{%s}}}" % name
def processTemplateCall(templates, tokens, args):
template = tokens.next().strip().lower()
args = {}
a = 1
t = tokens.next()
while True:
if isinstance(t, Delimiter):
name = unicode(a)
arg = ""
while True:
t = tokens.next()
if isinstance(t, unicode):
arg += t
elif isinstance(t, OpenDouble):
arg += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple):
arg += processSub(templates, tokens, args)
elif isinstance(t, Delimiter) and t.c != '|':
arg += str(t)
else:
break
if isinstance(t, Equals):
name = arg.strip()
arg = ""
while True:
t = tokens.next()
if isinstance(t, (unicode, Equals)):
arg += unicode(t)
elif isinstance(t, OpenDouble):
arg += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple):
arg += processSub(templates, tokens, args)
elif isinstance(t, Delimiter) and t.c != '|':
arg += str(t)
else:
break
arg = arg.strip()
else:
a += 1
args[name] = arg
elif isinstance(t, CloseDouble):
break
else:
print "Unexpected:", t
raise SyntaxError
#print template, args
if template[0] == '#':
if template == "#if":
if args['1'].strip():
return args['2']
elif '3' in args:
return args['3']
else:
return ""
elif template == "#ifeq":
if args['1'].strip() == args['2'].strip():
return args['3']
elif '4' in args:
return args['4']
else:
return ""
elif template == "#ifexist":
return ""
elif template == "#switch":
sw = args['1'].strip()
if sw in args:
return args[sw]
else:
return ""
else:
print "Unknown ParserFunction:", template
sys.exit(1)
if template not in templates:
return "{{%s}}" % template
return process(templates, templates[template], args)
def process(templates, s, args = {}):
s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s)
s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s)
assert "<onlyinclude>" not in s
#s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s)
r = ""
#print list(Tokenise(s))
tokens = Tokenise(s)
try:
while True:
t = tokens.next()
if isinstance(t, OpenDouble):
r += processTemplateCall(templates, tokens, args)
elif isinstance(t, OpenTriple):
r += processSub(templates, tokens, args)
else:
r += unicode(t)
except StopIteration:
pass
return r
def test():
templates = {
'lb': "{{",
'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
't': "start-{{{1|pqr}}}-end",
't0': "start-{{{1}}}-end",
't1': "start{{{1}}}end<noinclude>moo</noinclude>",
't2a1': "{{t2demo|a|{{{1}}}}}",
't2a2': "{{t2demo|a|2={{{1}}}}}",
't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
't5': "{{t2demo|{{{a}}}=b}}",
't6': "t2demo|a",
}
def t(text, expected):
print "text:", text
s = process(templates, text)
if s != expected:
print "got:", s
print "expected:", expected
sys.exit(1)
t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
t("{{t0|a}}", "start-a-end")
t("{{t0| }}", "start- -end")
t("{{t0|}}", "start--end")
t("{{t0}}", "start-{{{1}}}-end")
t("{{t0| }}", "start- -end")
t("{{t0|\n}}", "start-\n-end")
t("{{t0|1= }}", "start--end")
t("{{t0|1=\n}}", "start--end")
t("{{T}}", "start-pqr-end")
t("{{T|}}", "start--end")
t("{{T|abc}}", "start-abc-end")
t("{{T|abc|def}}", "start-abc-end")
t("{{T|1=abc|1=def}}", "start-def-end")
t("{{T|abc|1=def}}", "start-def-end")
t("{{T|1=abc|def}}", "start-def-end")
t("{{T|{{T}}}}", "start-start-pqr-end-end")
t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
t("{{T|a=b}}", "start-pqr-end")
t("{{T|1=a=b}}", "start-a=b-end")
#t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
#t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
#t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
#t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
#t("{{ {{t6}} }}", "{{ t2demo|a }}")
t("{{t|[[a|b]]}}", "start-b-end")
t("{{t|[[a|b]] }}", "start-b -end")
Parts = {
# Standard POS headers
'noun': "n.",
'Noun': "n.",
'Noun 1': "n.",
'Noun 2': "n.",
'Verb': "v.",
'Adjective': "adj.",
'Adverb': "adv.",
'Pronoun': "pron.",
'Conjunction': "conj.",
'Interjection': "interj.",
'Preposition': "prep.",
'Proper noun': "n.p.",
'Proper Noun': "n.p.",
'Article': "art.",
# Standard non-POS level 3 headers
'{{acronym}}': "acr.",
'Acronym': "acr.",
'{{abbreviation}}': "abbr.",
'[[Abbreviation]]': "abbr.",
'Abbreviation': "abbr.",
'[[initialism]]': "init.",
'{{initialism}}': "init.",
'Initialism': "init.",
'Contraction': "cont.",
'Prefix': "prefix",
'Suffix': "suffix",
'Symbol': "sym.",
'Letter': "letter",
'Idiom': "idiom",
'Idioms': "idiom",
'Phrase': "phrase",
# Debated POS level 3 headers
'Number': "num.",
'Numeral': "num.",
'Cardinal number': "num.",
'Ordinal number': "num.",
'Cardinal numeral': "num.",
'Ordinal numeral': "num.",
# Other headers in use
'Personal pronoun': "pers.pron.",
'Adjective/Adverb': "adj./adv.",
'Proper adjective': "prop.adj.",
'Determiner': "det.",
'Demonstrative determiner': "dem.det.",
'Clitic': "clitic",
'Infix': "infix",
'Counter': "counter",
'Kanji': None,
'Kanji reading': None,
'Hiragana letter': None,
'Katakana letter': None,
'Pinyin': None,
'Han character': None,
'Hanzi': None,
'Hanja': None,
'Proverb': "prov.",
'Expression': None,
'Adjectival noun': None,
'Quasi-adjective': None,
'Particle': "part.",
'Infinitive particle': "part.",
'Possessive adjective': "poss.adj.",
'Verbal prefix': "v.p.",
'Postposition': "post.",
'Prepositional article': "prep.art.",
'Phrasal verb': "phr.v.",
'Participle': "participle",
'Interrogative auxiliary verb': "int.aux.v.",
'Pronominal adverb': "pron.adv.",
'Adnominal': "adn.",
'Abstract pronoun': "abs.pron.",
'Conjunction particle': None,
'Root': "root",
# Non-standard, deprecated headers
'Noun form': "n.",
'Verb form': "v.",
'Adjective form': "adj.form.",
'Nominal phrase': "nom.phr.",
'Noun phrase': "n. phrase",
'Verb phrase': "v. phrase",
'Transitive verb': "v.t.",
'Intransitive verb': "v.i.",
'Reflexive verb': "v.r.",
'Cmavo': None,
'Romaji': "rom.",
'Hiragana': None,
'Furigana': None,
'Compounds': None,
# Other headers seen
'Alternative forms': None,
'Alternative spellings': None,
'Anagrams': None,
'Antonym': None,
'Antonyms': None,
'Conjugation': None,
'Declension': None,
'Declension and pronunciations': None,
'Definite Article': "def.art.",
'Definite article': "def.art.",
'Demonstrative pronoun': "dem.pron.",
'Derivation': None,
'Derived expression': None,
'Derived expressions': None,
'Derived forms': None,
'Derived phrases': None,
'Derived terms': None,
'Derived, Related terms': None,
'Descendants': None,
#'Etymology': None,
#'Etymology 1': None,
#'Etymology 2': None,
#'Etymology 3': None,
#'Etymology 4': None,
#'Etymology 5': None,
'Examples': None,
'External links': None,
'[[Gismu]]': None,
'Gismu': None,
'Homonyms': None,
'Homophones': None,
'Hyphenation': None,
'Indefinite article': "art.",
'Indefinite pronoun': "ind.pron.",
'Indefinite Pronoun': "ind.pron.",
'Indetermined pronoun': "ind.pron.",
'Interrogative conjunction': "int.conj.",
'Interrogative determiner': "int.det.",
'Interrogative particle': "int.part.",
'Interrogative pronoun': "int.pron.",
'Legal expression': "legal",
'Mass noun': "n.",
'Miscellaneous': None,
'Mutations': None,
'Noun and verb': "n/v.",
'Other language': None,
'Pinyin syllable': None,
'Possessive determiner': "poss.det.",
'Possessive pronoun': "poss.pron.",
'Prepositional phrase': "prep.phr.",
'Prepositional Pronoun': "prep.pron.",
'Pronunciation': None,
'Pronunciation 1': None,
'Pronunciation 2': None,
'Quotations': None,
'References': None,
'Reflexive pronoun': "refl.pron.",
'Related expressions': None,
'Related terms': None,
'Related words': None,
'Relative pronoun': "rel.pron.",
'Saying': "saying",
'See also': None,
'Shorthand': None,
'[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
'Sister projects': None,
'Spelling note': None,
'Synonyms': None,
'Translation': None,
'Translations': None,
'Translations to be checked': None,
'Transliteration': None,
'Trivia': None,
'Usage': None,
'Usage in English': None,
'Usage notes': None,
'Verbal noun': "v.n.",
}
PartsUsed = {}
for p in Parts.keys():
PartsUsed[p] = 0
def encode(s):
r = e(s)
assert r[1] == len(s)
return r[0]
def dowikilink(m):
    """re.sub callback for [[...]] wiki links.

    Renders [[target|label]] as its label and [[target]] as the target;
    namespaced links (anything whose text contains ':') are dropped
    entirely by returning the empty string.
    """
    fields = m.group(1).split("|")
    text = fields[1] if len(fields) > 1 else fields[0]
    return "" if ':' in text else text
seentemplates = {}
def dotemplate(m):
aa = m.group(1).split("|")
args = {}
n = 0
for a in aa:
am = re.match(r"(.*?)(=(.*))?", a)
if am:
args[am.group(1)] = am.group(3)
else:
n += 1
args[n] = am.group(1)
#if aa[0] in seentemplates:
# seentemplates[aa[0]] += 1
#else:
# seentemplates[aa[0]] = 1
# print len(seentemplates), aa[0]
#print aa[0]
#if aa[0] not in Templates:
# return "(unknown template %s)" % aa[0]
#body = Templates[aa[0]]
#body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
#assert "<onlyinclude>" not in body
##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
#body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
#def dotemplatearg(m):
# ta = m.group(1).split("|")
# if ta[0] in args:
# return args[ta[0]]
# elif len(ta) > 1:
# return ta[1]
# else:
# return "{{{%s}}}" % ta[0]
#body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
#return dewiki(body)
def doparserfunction(m):
    """re.sub callback for {{#fn:...}} parser functions.

    Only #ifeq is handled: returns the then-branch when the first two
    '|'-separated arguments are equal, otherwise the else-branch if one
    was given.  Everything else evaluates to the empty string.
    """
    name = m.group(1)
    args = m.group(2).split("|")
    if name == "ifeq":
        if args[0] == args[1]:
            return args[2]
        if len(args) >= 4:
            return args[3]
    return ""
def dewiki(body, indent = 0):
# process in this order:
# {{{ }}}
# <> <>
# [[ ]]
# {{ }}
# ''' '''
# '' ''
#body = wikimediatemplate.process(Templates, body)
body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
#body = re.sub(r"{{(.*?)}}", dotemplate, body)
#body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
body = re.sub(r"'''(.*?)'''", r"\1", body)
body = re.sub(r"''(.*?)''", r"\1", body)
lines = body.split("\n")
n = 0
i = 0
while i < len(lines):
if len(lines[i]) > 0 and lines[i][0] == "#":
if len(lines[i]) > 1 and lines[i][1] == '*':
wlines = textwrap.wrap(lines[i][2:].strip(),
initial_indent = " * ",
subsequent_indent = " ")
elif len(lines[i]) > 1 and lines[i][1] == ':':
wlines = textwrap.wrap(lines[i][2:].strip(),
initial_indent = " ",
subsequent_indent = " ")
else:
n += 1
wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(),
subsequent_indent = " ")
elif len(lines[i]) > 0 and lines[i][0] == "*":
n = 0
wlines = textwrap.wrap(lines[i][1:].strip(),
initial_indent = "* ",
subsequent_indent = " ")
else:
n = 0
wlines = textwrap.wrap(lines[i].strip())
if len(wlines) == 0:
wlines = ['']
lines[i:i+1] = wlines
i += len(wlines)
return ''.join(" "*(indent-1)+x+"\n" for x in lines)
class WikiSection:
    """A heading together with its body text and nested subsections."""

    def __init__(self, heading, body):
        self.heading = heading
        self.body = body
        self.children = []

    def __str__(self):
        # Debug form: <heading:body-length:child,child,...>
        inner = ','.join(str(child) for child in self.children)
        return "<%s:%i:%s>" % (self.heading, len(self.body or ""), inner)

    def add(self, section):
        self.children.append(section)
def parse(word, text):
headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
#print [x.group(1) for x in headings]
doc = WikiSection(word, "")
stack = [doc]
for i, m in enumerate(headings):
depth = len(m.group(1))
if depth < len(stack):
stack = stack[:depth]
else:
while depth > len(stack):
s = WikiSection(None, "")
stack[-1].add(s)
stack.append(s)
if i+1 < len(headings):
s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip())
else:
s = WikiSection(m.group(2), text[m.end(0):].strip())
assert len(stack) == depth
stack[-1].add(s)
stack.append(s)
#while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
# doc = doc.children[0]
return doc
def formatFull(word, doc):
def f(depth, section):
if section.heading:
r = " "*(depth-1) + section.heading + "\n\n"
else:
r = ""
if section.body:
r += dewiki(section.body, depth+1)+"\n"
#r += "".join(" "*depth + x + "\n" for x in dewiki(section.body))
#if len(section.lines) > 0:
# r += "\n"
for c in section.children:
r += f(depth+1, c)
return r
s = f(0, doc)
s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
return s
def formatNormal(word, doc):
def f(depth, posdepth, section):
r = ""
if depth == posdepth:
if not section.heading or section.heading.startswith("Etymology"):
posdepth += 1
elif section.heading in Parts:
#p = Parts[section.heading]
#if p:
# r += " "*(depth-1) + word + " (" + p + ")\n\n"
r += " "*(depth-1) + section.heading + "\n\n"
else:
print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
return ""
elif depth > posdepth:
return ""
elif section.heading:
r += " "*(depth-1) + section.heading + "\n\n"
if section.body:
r += dewiki(section.body, depth+1)+"\n"
#r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines))
#if len(section.lines) > 0:
# r += "\n"
for c in section.children:
r += f(depth+1, posdepth, c)
return r
s = f(0, 3, doc)
s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
return s
def formatBrief(word, doc):
def f(depth, posdepth, section):
if depth == posdepth:
h = section.heading
if not section.heading or section.heading.startswith("Etymology"):
posdepth += 1
elif section.heading in Parts:
#h = Parts[section.heading]
#if h:
# h = "%s (%s)" % (word, h)
pass
stack.append([h, False])
elif depth > 0:
stack.append([section.heading, False])
else:
stack.append(["%h " + section.heading, False])
r = ""
#if section.heading:
# r += " "*(depth-1) + section.heading + "\n"
body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#')
if len(body) > 0:
for i in range(len(stack)):
if not stack[i][1]:
if stack[i][0]:
r += " "*(i-1) + stack[i][0] + "\n"
stack[i][1] = True
r += dewiki(body, depth+1)
for c in section.children:
r += f(depth+1, posdepth, c)
stack.pop()
return r
stack = []
s = f(0, 3, doc)
s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
return s
class WikiHandler(xml.sax.ContentHandler):
def __init__(self):
self.element = None
self.page = None
self.text = ""
self.long = {}
def startElement(self, name, attrs):
#print "start", name, attrs
self.element = name
def endElement(self, name):
#print "end", name
if self.element == "text":
if self.page:
if self.page in self.long:
print self.page, len(self.text)
print
self.doPage(self.page, self.text)
self.page = None
self.text = ""
self.element = None
def characters(self, content):
#print "characters", content
if self.element == "title":
if self.checkPage(content):
self.page = content
elif self.element == "text":
if self.page:
self.text += content
if len(self.text) > 100000 and self.page not in self.long:
self.long[self.page] = 1
def checkPage(self, page):
return False
def doPage(self, page, text):
pass
class TemplateHandler(WikiHandler):
def checkPage(self, page):
return page.startswith("Template:")
def doPage(self, page, text):
Templates[page[page.find(':')+1:].lower()] = text
class WordHandler(WikiHandler):
def checkPage(self, page):
return ':' not in page
def doPage(self, page, text):
m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
if m:
out.write(" See <%s>" % page)
return
doc = parse(page, text)
out.write(formatBrief(page, doc))
#print formatBrief(page, doc)
fn = sys.argv[1]
info = """ This file was converted from the original database on:
%s
The original data is available from:
http://en.wiktionary.org
The version from which this file was generated was:
%s
Wiktionary is available under the GNU Free Documentation License.
""" % (time.ctime(), os.path.basename(fn))
errors = codecs.open("mkdict.err", "w", "utf_8")
e = codecs.getencoder("utf_8")
Templates = {}
f = os.popen("bunzip2 -c %s" % fn, "r")
xml.sax.parse(f, TemplateHandler())
f.close()
f = os.popen("bunzip2 -c %s" % fn, "r")
out = codecs.getwriter("utf_8")(
os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))
out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))
xml.sax.parse(f, WordHandler())
f.close()
out.close()

View file

@ -0,0 +1,319 @@
#!/usr/bin/env python
#Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.
# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation
# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
# This program requires python >= 2.4.
# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures whould parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.
import datetime
from textwrap import TextWrapper
CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3
category_map = {
'n': CAT_NOUN,
'v': CAT_VERB,
'a': CAT_ADJECTIVE,
's': CAT_ADJECTIVE,
'r': CAT_ADVERB
}
class WordIndex:
    """One entry of a WordNet index.* file: a lemma in one syntactic
    category plus the synsets it appears in."""

    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        # Parse one whitespace-separated index line; synset offsets are
        # resolved through synset_map to the objects built from data.*.
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2],10)
        ptr_count = int(line_split[3],10)
        # NOTE(review): range(3, 3+ptr_count) includes the count field at
        # index 3 and drops the last pointer symbol; the wndb index format
        # puts the symbols at indices 4..3+ptr_count.  Looks off by one --
        # ptrs is never read in the visible code, but verify before fixing.
        ptrs = [line_split[i] for i in range(3, 3+ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count],10)
        synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        # Parse a whole index file into {lowercased lemma: [WordIndex, ...]},
        # optionally extending an existing mapping (rv_base).  Lines that
        # start with a space are the license header and are skipped.
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base
        for line in f:
            if (line.startswith(' ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
class WordIndexDictFormatter(WordIndex):
    """WordIndex subclass that renders its synset glosses as the wrapped,
    numbered body text of a dictd entry."""

    # Human-readable POS tags for the entry text (reverse of category_map).
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' '

    def dict_str(self):
        """Return all synset glosses of this entry, each wrapped to
        LINE_WIDTH_MAX columns; the first sense is prefixed with the POS tag,
        later senses with their running number."""
        rendered = []
        for num, synset in enumerate(self.synsets, start=1):
            if num == 1:
                head = self.prefix_fmtf_line_first % self.category_map_rev[self.category]
                cont = self.prefix_fmtn_line_first
            else:
                head = self.prefix_fmtf_line_nonfirst % num
                cont = self.prefix_fmtn_line_nonfirst
            wrapper = TextWrapper(width=self.LINE_WIDTH_MAX,
                                  initial_indent=head,
                                  subsequent_indent=cont)
            rendered.extend(wrapper.wrap(synset.dict_str()))
        return self.linesep.join(rendered)
class Synset:
    """A single synonym set parsed from a wordnet data.<pos> file."""

    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset    # byte offset of the synset in its data file; used as its id
        self.type = ss_type     # one of the CAT_* constants
        self.words = words      # word forms belonging to this synset
        self.ptrs = ptrs        # raw pointer 4-tuples, kept unparsed
        self.gloss = gloss      # text after the '|' separator (may be None)
        self.frames = frames    # (f_num, w_num) verb frame pairs
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        """Parse one synset line of a wordnet data file.

        Data line layout (whitespace separated):
          synset_offset lex_filenum ss_type w_cnt (word lex_id)...
          p_cnt ptr... [f_cnt frame...] | gloss
        w_cnt is hexadecimal, p_cnt decimal; frame w_num is hexadecimal.
        """
        line_split = line_data.split()
        synset_offset = int(line_split[0],10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3],16)
        # Words come as (word, lex_id) pairs; keep only the word forms.
        words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
        ptr_count = int(line_split[4 + word_count*2],10)
        # Each pointer is 4 tokens: symbol, offset, pos, source/target.
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]
        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            # Verb synsets carry frame data before the gloss separator:
            # f_cnt, then f_cnt triples of ('+', f_num, w_num).
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []
        # Re-split with bounded maxsplit so the gloss survives as one field.
        line_split2 = line_data.split(None, base)
        if (len(line_split2) < base):
            gloss = None
        else:
            gloss = line_split2[-1]
        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        """Parse a whole data file.

        Returns (synsets, comments) where synsets maps offset -> Synset and
        comments is the list of numbered license/header lines found at the
        top of the file, with the leading line number stripped.
        """
        rv = {}
        comments = []
        for line in f:
            if (line.startswith(' ')):
                # Indented header line; keep only lines that start with a
                # number (the numbered license text), minus that number.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None,1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset
        return (rv, comments)

    def dict_str(self):
        """Return the gloss, with a ' [syn: {w1}, {w2}, ...]' cross-reference
        appended when the synset contains more than one word."""
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
class WordnetDict:
    """Accumulates parsed wordnet index/data file pairs and writes them back
    out as a dictd database (a .index / .dict file pair)."""

    db_info_fmt = '''This file was converted from the original database on:
%(conversion_datetime)s
The original data is available from:
%(wn_url)s
The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restriction that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    # dictd's base64 alphabet, used for offsets/lengths in the .index file.
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}           # lowercased lemma -> [WordIndexDictFormatter, ...]
        self.wn_url = wn_url          # source URL, reported in 00-database-url
        self.desc_short = desc_short  # text for 00-database-short
        self.desc_long = desc_long    # text for 00-database-long
        self.wn_license = None        # license text collected from data file headers

    def wn_dict_add(self, file_index, file_data):
        """Parse one (index file, data file) pair and merge the result into
        self.word_data; remembers the license text from the data file."""
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string"""
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the smallest digit count e such that 64**e - 1 >= i.
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = 64**e - 1
        rv = ''
        while (e > 0):
            e -= 1
            # Floor division: plain '/' is float division under Python 3 and
            # would make the base64_map index a float (TypeError).
            d = i // 64**e
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files"""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        # Index line format: headword TAB offset TAB length, base64-encoded.
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

    def dict_generate(self, file_index, file_data):
        """Write the complete dictd index and data files from word_data."""
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the whole
        # story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        # These local names are referenced by db_info_fmt through vars().
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)
        # sorted() works for both Python 2 lists and Python 3 dict views;
        # the former keys()/list.sort() combination breaks under Python 3.
        for word in sorted(self.word_data.keys()):
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first synset that
                # matches this word case-insensitively
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break
            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'
            # Headword line first, then the formatted senses.
            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)
        file_index.truncate()
        file_data.truncate()
if (__name__ == '__main__'):
    # Command-line driver: convert one or more wordnet index/data file pairs
    # into a single dictd index/data file pair.
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')
    (options, args) = op.parse_args()
    # Inputs come in (index, data) pairs; an odd count previously crashed
    # with a bare IndexError on args[i+1].
    if (len(args) % 2 != 0):
        op.error('input files must be given as <wn_index_file> <wn_data_file> pairs')
    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)
    for i in range(0, len(args), 2):
        # open() instead of the py2-only file() builtin; print() form works
        # under both Python 2 and 3.
        print('Opening index file %r...' % args[i])
        file_index = open(args[i])
        print('Opening data file %r...' % args[i+1])
        file_data = open(args[i+1])
        print('Parsing index file and data file...')
        wnd.wn_dict_add(file_index, file_data)
        # The parsed contents are kept in wnd; the input handles can go.
        file_index.close()
        file_data.close()
    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))
    wnd.dict_generate(open(options.oi, 'w'), open(options.od, 'w'))
    print('All done.')