uboot: (firmwareOdroidC2/C4) don't invoke patch tool, use patches = [] instead
https://github.com/NixOS/nixpkgs/blob/master/pkgs/stdenv/generic/setup.sh#L948 this can do it nicely. Signed-off-by: Anton Arapov <anton@deadbeef.mx>
This commit is contained in:
commit
56de2bcd43
30691 changed files with 3076956 additions and 0 deletions
11
pkgs/servers/dict/buildfix.diff
Normal file
11
pkgs/servers/dict/buildfix.diff
Normal file
|
|
@@ -0,0 +1,11 @@
|
|||
--- Makefile.in~ 2011-03-06 18:52:54.000000000 +0100
|
||||
+++ Makefile.in 2014-01-29 19:04:51.384844897 +0100
|
||||
@@ -123,7 +123,7 @@
|
||||
|
||||
%: %.o
|
||||
$(LIBTOOL) --tag=CC --mode=link $(CC) -o $@ -static \
|
||||
- $^ $(OBJS) $(LDFLAGS) -lz ${LIBS}
|
||||
+ $(^:.o=.lo) $(OBJS) $(LDFLAGS) -lz ${LIBS}
|
||||
|
||||
include $(srcdir)/deps
|
||||
|
||||
37
pkgs/servers/dict/default.nix
Normal file
37
pkgs/servers/dict/default.nix
Normal file
|
|
@@ -0,0 +1,37 @@
|
|||
{ lib, stdenv, fetchurl, which, bison, flex, libmaa, zlib, libtool }:

stdenv.mkDerivation rec {
  pname = "dictd";
  version = "1.13.1";

  src = fetchurl {
    url = "mirror://sourceforge/dict/dictd-${version}.tar.gz";
    sha256 = "sha256-5PGmfRaJTYSUVp19yUQsFcw4wBHyuWMcfxzGInZlKhs=";
  };

  # Let the generic patch phase apply the fix instead of overriding
  # patchPhase with a hand-rolled `patch` invocation (which also skipped
  # the prePatch/postPatch hooks).
  patches = [ ./buildfix.diff ];
  # buildfix.diff is a -p0 patch; nixpkgs defaults to -p1.
  patchFlags = [ "-p0" ];

  buildInputs = [ libmaa zlib ];

  nativeBuildInputs = [ bison flex libtool which ];

  # In earlier versions, parallel building was not supported but it's OK with 1.13
  enableParallelBuilding = true;

  configureFlags = [
    "--datadir=/run/current-system/sw/share/dictd"
    "--sysconfdir=/etc"
  ];

  postInstall = ''
    install -Dm444 -t $out/share/doc/${pname} NEWS README
  '';

  meta = with lib; {
    description = "Dict protocol server and client";
    homepage = "http://www.dict.org";
    license = licenses.gpl2;
    maintainers = with maintainers; [ ];
    platforms = platforms.linux;
  };
}
|
||||
83
pkgs/servers/dict/dictd-db-collector.nix
Normal file
83
pkgs/servers/dict/dictd-db-collector.nix
Normal file
|
|
@@ -0,0 +1,83 @@
|
|||
{ stdenv, lib, dict }:
({ dictlist, allowList ? [ "127.0.0.1" ], denyList ? [ ] }:

/*
  dictlist is a list of form
  [ { filename = /path/to/files/basename;
      name = "name"; } ]
  basename.dict.dz and basename.index should be
  dict files. Or look below for other options.
  allowList is a list of IP/domain *-wildcarded strings
  denyList is the same..
*/

let
  # One shell-quoted word per dictionary, consumed by the for-loop below.
  link_arguments = map
    (x: '' "${x.filename}" '')
    dictlist;
  # "name path" lines used to recover each database's name from its path.
  databases = lib.concatStrings (map
    (x:
      "${x.name} ${x.filename}\n")
    dictlist);
  allow = lib.concatStrings (map (x: "allow ${x}\n") allowList);
  deny = lib.concatStrings (map (x: "deny ${x}\n") denyList);
  accessSection = "
    access {
      ${allow}
      ${deny}
    }
  ";
  # Collect every dictionary into $out/share/dictd and emit a dictd.conf
  # section per database (data/index plus word/suffix search indices).
  installPhase = ''
    mkdir -p $out/share/dictd
    cd $out/share/dictd
    echo "${databases}" >databases.names
    echo "${accessSection}" > dictd.conf
    for j in ${toString link_arguments}; do
      name="$(egrep ' '"$j"\$ databases.names)"
      name=''${name% $j}
      if test -d "$j"; then
        if test -d "$j"/share/dictd ; then
          echo "Got store path $j"
          j="$j"/share/dictd
        fi
        echo "Directory reference: $j"
        i=$(ls "$j""/"*.index)
        i="''${i%.index}";
      else
        i="$j";
      fi
      echo "Basename is $i"
      locale=$(cat "$(dirname "$i")"/locale)
      base="$(basename "$i")"
      echo "Locale is $locale"
      export LC_ALL=$locale
      export LANG=$locale
      if test -e "$i".dict.dz; then
        ln -s "$i".dict.dz
      else
        cp "$i".dict .
        dictzip "$base".dict
      fi
      ln -s "$i".index .
      dictfmt_index2word --locale $locale < "$base".index > "$base".word || true
      dictfmt_index2suffix --locale $locale < "$base".index > "$base".suffix || true

      echo "database $name {" >> dictd.conf
      echo " data $out/share/dictd/$base.dict.dz" >> dictd.conf
      echo " index $out/share/dictd/$base.index" >> dictd.conf
      echo " index_word $out/share/dictd/$base.word" >> dictd.conf
      echo " index_suffix $out/share/dictd/$base.suffix" >> dictd.conf
      echo "}" >> dictd.conf
    done
  '';

in

stdenv.mkDerivation {
  name = "dictd-dbs";

  buildInputs = [ dict ];

  dontUnpack = true;
  inherit installPhase;
})
|
||||
95
pkgs/servers/dict/dictd-db.nix
Normal file
95
pkgs/servers/dict/dictd-db.nix
Normal file
|
|
@@ -0,0 +1,95 @@
|
|||
{ lib, stdenv, fetchurl, callPackage }:

let
  # Probably a bug in some FreeDict release files, but easier to trivially
  # work around than report. Not that it can cause any other problems..
  makeDictdDBFreedict = src: name: locale:
    makeDictdDB src name "{.,bin}" locale;

  # Build one dictd database package: unpack the tarball, copy the
  # .dict*/.index files from _subdir, and record the locale used for
  # index generation.
  makeDictdDB = src: _name: _subdir: _locale:
    stdenv.mkDerivation {
      name = "dictd-db-${_name}";
      inherit src;
      locale = _locale;
      dbName = _name;
      dontBuild = true;
      unpackPhase = ''
        tar xf ${src}
      '';
      installPhase = ''
        mkdir -p $out/share/dictd
        cp $(ls ./${_subdir}/*.{dict*,index} || true) $out/share/dictd
        echo "${_locale}" >$out/share/dictd/locale
      '';

      meta = {
        description = "dictd-db dictionary for dictd";
        platforms = lib.platforms.linux;
      };
    };
in rec {
  deu2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/deu-eng.tar.gz";
    sha256 = "0dqrhv04g4f5s84nbgisgcfwk5x0rpincif0yfhfh4sc1bsvzsrb";
  }) "deu-eng" "de_DE";
  eng2deu = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-deu.tar.gz";
    sha256 = "01x12p72sa3071iff3jhzga8588440f07zr56r3x98bspvdlz73r";
  }) "eng-deu" "en_EN";
  nld2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/nld-eng.tar.gz";
    sha256 = "1vhw81pphb64fzsjvpzsnnyr34ka2fxizfwilnxyjcmpn9360h07";
  }) "nld-eng" "nl_NL";
  eng2nld = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-nld.tar.gz";
    sha256 = "0rcg28ldykv0w2mpxc6g4rqmfs33q7pbvf68ssy1q9gpf6mz7vcl";
  }) "eng-nld" "en_UK";
  eng2rus = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-rus.tar.gz";
    sha256 = "15409ivhww1wsfjr05083pv6mg10bak8v5pg1wkiqybk7ck61rry";
  }) "eng-rus" "en_UK";
  fra2eng = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/fra-eng.tar.gz";
    sha256 = "0sdd88s2zs5whiwdf3hd0s4pzzv75sdsccsrm1wxc87l3hjm85z3";
  }) "fra-eng" "fr_FR";
  eng2fra = makeDictdDBFreedict (fetchurl {
    url = "mirror://sourceforge/freedict/eng-fra.tar.gz";
    sha256 = "0fi6rrnbqnhc6lq8d0nmn30zdqkibrah0mxfg27hsn9z7alwbj3m";
  }) "eng-fra" "en_UK";

  # The Mueller archive ships several databases in one tarball; the
  # attrsets below expose each of them individually.
  mueller_eng2rus_pkg = makeDictdDB (fetchurl {
    url = "mirror://sourceforge/mueller-dict/mueller-dict-3.1.tar.gz";
    sha256 = "04r5xxznvmcb8hkxqbjgfh2gxvbdd87jnhqn5gmgvxxw53zpwfmq";
  }) "mueller-eng-rus" "mueller-dict-*/dict" "en_UK";
  mueller_enru_abbr = {
    outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-abbrev";
    name = "mueller-abbr";
    dbName = "mueller-abbr";
    locale = "en_UK";
  };
  mueller_enru_base = {
    outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-base";
    name = "mueller-base";
    dbName = "mueller-base";
    locale = "en_UK";
  };
  mueller_enru_dict = {
    outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-dict";
    name = "mueller-dict";
    dbName = "mueller-dict";
    locale = "en_UK";
  };
  mueller_enru_geo = {
    outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-geo";
    name = "mueller-geo";
    dbName = "mueller-geo";
    locale = "en_UK";
  };
  mueller_enru_names = {
    outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-names";
    name = "mueller-names";
    dbName = "mueller-names";
    locale = "en_UK";
  };
  wordnet = callPackage ./dictd-wordnet.nix {};
  wiktionary = callPackage ./wiktionary {};
}
|
||||
36
pkgs/servers/dict/dictd-wordnet.nix
Normal file
36
pkgs/servers/dict/dictd-wordnet.nix
Normal file
|
|
@@ -0,0 +1,36 @@
|
|||
{ lib, stdenv, python2, wordnet, writeScript }:

stdenv.mkDerivation rec {
  pname = "dict-db-wordnet";
  version = "542";

  buildInputs = [ python2 wordnet ];
  # Conversion script shipped next to this expression.
  convert = ./wordnet_structures.py;

  # NOTE(review): a custom builder skips the standard phases on purpose;
  # it only pairs each WordNet data.* file with its index.* counterpart
  # and feeds them to the converter.
  builder = writeScript "builder.sh" ''
    . ${stdenv}/setup
    mkdir -p $out/share/dictd/
    cd $out/share/dictd

    for i in ${wordnet}/dict/data.*; do
      DATA="$DATA `echo $i | sed -e s,data,index,` $i";
    done

    python ${convert} $DATA
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "dictd-compatible version of WordNet";

    longDescription =
      '' WordNet® is a large lexical database of English. This package makes
         the wordnet data available to dictd and by extension for lookup with
         the dict command. '';

    homepage = "https://wordnet.princeton.edu/";

    maintainers = [ ];
    platforms = lib.platforms.all;
  };
}
|
||||
22
pkgs/servers/dict/libmaa.nix
Normal file
22
pkgs/servers/dict/libmaa.nix
Normal file
|
|
@@ -0,0 +1,22 @@
|
|||
{ lib, stdenv, fetchurl, libtool }:

stdenv.mkDerivation rec {
  pname = "libmaa";
  version = "1.3.2";

  src = fetchurl {
    url = "mirror://sourceforge/dict/libmaa-${version}.tar.gz";
    sha256 = "1idi4c30pi79g5qfl7rr9s17krbjbg93bi8f2qrbsdlh78ga19ar";
  };

  buildInputs = [ libtool ];
  # configureFlags = [ "--datadir=/run/current-system/share/dictd" ];

  NIX_CFLAGS_COMPILE = "-Wno-error=format-truncation";

  meta = with lib; {
    # Fixed: the description was copy-pasted from the dictd package
    # ("Dict protocol server and client"); libmaa is dictd's support
    # library of low-level data structures, not the server itself.
    description = "Low-level data structure library used by dictd";
    maintainers = [ ];
    platforms = platforms.linux;
  };
}
|
||||
35
pkgs/servers/dict/wiktionary/default.nix
Normal file
35
pkgs/servers/dict/wiktionary/default.nix
Normal file
|
|
@@ -0,0 +1,35 @@
|
|||
{ lib, stdenv, fetchurl, python2, dict, glibcLocales }:

stdenv.mkDerivation rec {
  pname = "dict-db-wiktionary";
  version = "20220420";

  src = fetchurl {
    url = "https://dumps.wikimedia.org/enwiktionary/${version}/enwiktionary-${version}-pages-articles.xml.bz2";
    sha256 = "qsha26LL2513SDtriE/0zdPX1zlnpzk1KKk+R9dSdew=";
  };

  # script in nixpkgs does not support python2
  nativeBuildInputs = [ python2 dict glibcLocales ];

  # The source is a single .xml.bz2 dump consumed directly by the
  # converter, so there is nothing to unpack.
  dontUnpack = true;

  installPhase = ''
    mkdir -p $out/share/dictd/
    cd $out/share/dictd

    ${python2.interpreter} -O ${./wiktionary2dict.py} "${src}"
    dictzip wiktionary-en.dict
    echo en_US.UTF-8 > locale
  '';

  passthru.updateScript = ./update.sh;

  meta = with lib; {
    description = "DICT version of English Wiktionary";
    homepage = "https://en.wiktionary.org/";
    license = with licenses; [ cc-by-sa-30 fdl11Plus ];
    maintainers = with maintainers; [ qyliss ];
    platforms = platforms.all;
  };
}
|
||||
42
pkgs/servers/dict/wiktionary/latest_version.py
Normal file
42
pkgs/servers/dict/wiktionary/latest_version.py
Normal file
|
|
@@ -0,0 +1,42 @@
|
|||
import subprocess
|
||||
|
||||
from html.parser import HTMLParser
|
||||
from os.path import abspath, dirname
|
||||
from urllib.request import urlopen
|
||||
|
||||
class WiktionaryLatestVersionParser(HTMLParser):
    """Extract the newest dump version from the enwiktionary index page.

    The page lists ``<a href="YYYYMMDD/">`` directory links; the
    lexicographically greatest one is the latest dump (dates compare
    correctly as strings in this format).
    """

    def __init__(self, current_version, *args, **kwargs):
        # Seed with the version we already package so we never go backwards.
        self.latest_version = current_version
        super().__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return

        # Robustness fix: anchors without an href attribute (or with an
        # empty one) used to raise KeyError/TypeError; skip them instead.
        href = dict(attrs).get('href')
        if not href:
            return

        # Drop the trailing '/' of the directory link.
        href = href[0:-1]
        if href == 'latest':
            return

        self.latest_version = max(self.latest_version, href)
|
||||
|
||||
|
||||
def nix_prefetch_url(url, algo='sha256'):
    """Prefetches the content of the given URL."""
    print(f'nix-prefetch-url {url}')
    cmd = ['nix-prefetch-url', '--type', algo, url]
    result = subprocess.check_output(cmd)
    return result.decode('utf-8').rstrip()
|
||||
|
||||
|
||||
# Ask nix for the wiktionary version currently pinned in this tree.
repo_root = dirname(abspath(__file__)) + '/../../../..'
current_version = subprocess.check_output([
    'nix', 'eval', '--raw',
    '-f', repo_root,
    'dictdDBs.wiktionary.version',
]).decode('utf-8')

parser = WiktionaryLatestVersionParser(current_version)

# Feed the dump-directory listing through the parser to find the newest dump.
with urlopen('https://dumps.wikimedia.org/enwiktionary/') as resp:
    parser.feed(resp.read().decode('utf-8'))

print(parser.latest_version)
|
||||
7
pkgs/servers/dict/wiktionary/update.sh
Executable file
7
pkgs/servers/dict/wiktionary/update.sh
Executable file
|
|
@@ -0,0 +1,7 @@
|
|||
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p common-updater-scripts python3

set -ueo pipefail

# Determine the newest upstream dump, then bump version + hash in-tree.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
version="$(python "$script_dir"/latest_version.py)"
update-source-version dictdDBs.wiktionary "$version"
|
||||
778
pkgs/servers/dict/wiktionary/wiktionary2dict.py
Normal file
778
pkgs/servers/dict/wiktionary/wiktionary2dict.py
Normal file
|
|
@@ -0,0 +1,778 @@
|
|||
# Adapted to produce DICT-compatible files by Petr Rockai in 2012
|
||||
# Based on code from wiktiondict by Greg Hewgill
|
||||
import re
|
||||
import sys
|
||||
import codecs
|
||||
import os
|
||||
import textwrap
|
||||
import time
|
||||
import xml.sax
|
||||
|
||||
class Text:
    """A literal chunk of wiki text that processes to itself."""
    def __init__(self, s):
        self.s = s
    def process(self):
        # Fixed: the original returned the bare name `s`, which is not in
        # scope here and raised NameError; the stored text is `self.s`.
        return self.s
|
||||
|
||||
class TemplateCall:
|
||||
def __init__(self):
|
||||
pass
|
||||
def process(self):
|
||||
pass
|
||||
|
||||
class Template:
|
||||
def __init__(self):
|
||||
self.parts = []
|
||||
def append(self, part):
|
||||
self.parts.append(part)
|
||||
def process(self):
|
||||
return ''.join(x.process() for x in self.parts)
|
||||
|
||||
class Whitespace:
|
||||
def __init__(self, s):
|
||||
self.s = s
|
||||
|
||||
class OpenDouble: pass
|
||||
class OpenTriple: pass
|
||||
class CloseDouble: pass
|
||||
class CloseTriple: pass
|
||||
|
||||
class Equals:
|
||||
def __str__(self):
|
||||
return "="
|
||||
|
||||
class Delimiter:
|
||||
def __init__(self, c):
|
||||
self.c = c
|
||||
def __str__(self):
|
||||
return self.c
|
||||
|
||||
def Tokenise(s):
|
||||
s = unicode(s)
|
||||
stack = []
|
||||
last = 0
|
||||
i = 0
|
||||
while i < len(s):
|
||||
if s[i] == '{' and i+1 < len(s) and s[i+1] == '{':
|
||||
if i > last:
|
||||
yield s[last:i]
|
||||
if i+2 < len(s) and s[i+2] == '{':
|
||||
yield OpenTriple()
|
||||
stack.append(3)
|
||||
i += 3
|
||||
else:
|
||||
yield OpenDouble()
|
||||
stack.append(2)
|
||||
i += 2
|
||||
last = i
|
||||
elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}':
|
||||
if i > last:
|
||||
yield s[last:i]
|
||||
if len(stack) == 0:
|
||||
yield "}}"
|
||||
i += 2
|
||||
elif stack[-1] == 2:
|
||||
yield CloseDouble()
|
||||
i += 2
|
||||
stack.pop()
|
||||
elif i+2 < len(s) and s[i+2] == '}':
|
||||
yield CloseTriple()
|
||||
i += 3
|
||||
stack.pop()
|
||||
else:
|
||||
raise SyntaxError()
|
||||
last = i
|
||||
elif s[i] == ':' or s[i] == '|':
|
||||
if i > last:
|
||||
yield s[last:i]
|
||||
yield Delimiter(s[i])
|
||||
i += 1
|
||||
last = i
|
||||
elif s[i] == '=':
|
||||
if i > last:
|
||||
yield s[last:i]
|
||||
yield Equals()
|
||||
i += 1
|
||||
last = i
|
||||
#elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
|
||||
# if i > last:
|
||||
# yield s[last:i]
|
||||
# last = i
|
||||
# m = re.match(r"\s+", s[i:])
|
||||
# assert m
|
||||
# yield Whitespace(m.group(0))
|
||||
# i += len(m.group(0))
|
||||
# last = i
|
||||
else:
|
||||
i += 1
|
||||
if i > last:
|
||||
yield s[last:i]
|
||||
|
||||
def processSub(templates, tokens, args):
|
||||
t = tokens.next()
|
||||
if not isinstance(t, unicode):
|
||||
raise SyntaxError
|
||||
name = t
|
||||
t = tokens.next()
|
||||
default = None
|
||||
if isinstance(t, Delimiter) and t.c == '|':
|
||||
default = ""
|
||||
while True:
|
||||
t = tokens.next()
|
||||
if isinstance(t, unicode):
|
||||
default += t
|
||||
elif isinstance(t, OpenDouble):
|
||||
default += processTemplateCall(templates, tokens, args)
|
||||
elif isinstance(t, OpenTriple):
|
||||
default += processSub(templates, tokens, args)
|
||||
elif isinstance(t, CloseTriple):
|
||||
break
|
||||
else:
|
||||
print "Unexpected:", t
|
||||
raise SyntaxError()
|
||||
if name in args:
|
||||
return args[name]
|
||||
if default is not None:
|
||||
return default
|
||||
if name == "lang":
|
||||
return "en"
|
||||
return "{{{%s}}}" % name
|
||||
|
||||
def processTemplateCall(templates, tokens, args):
|
||||
template = tokens.next().strip().lower()
|
||||
args = {}
|
||||
a = 1
|
||||
t = tokens.next()
|
||||
while True:
|
||||
if isinstance(t, Delimiter):
|
||||
name = unicode(a)
|
||||
arg = ""
|
||||
while True:
|
||||
t = tokens.next()
|
||||
if isinstance(t, unicode):
|
||||
arg += t
|
||||
elif isinstance(t, OpenDouble):
|
||||
arg += processTemplateCall(templates, tokens, args)
|
||||
elif isinstance(t, OpenTriple):
|
||||
arg += processSub(templates, tokens, args)
|
||||
elif isinstance(t, Delimiter) and t.c != '|':
|
||||
arg += str(t)
|
||||
else:
|
||||
break
|
||||
if isinstance(t, Equals):
|
||||
name = arg.strip()
|
||||
arg = ""
|
||||
while True:
|
||||
t = tokens.next()
|
||||
if isinstance(t, (unicode, Equals)):
|
||||
arg += unicode(t)
|
||||
elif isinstance(t, OpenDouble):
|
||||
arg += processTemplateCall(templates, tokens, args)
|
||||
elif isinstance(t, OpenTriple):
|
||||
arg += processSub(templates, tokens, args)
|
||||
elif isinstance(t, Delimiter) and t.c != '|':
|
||||
arg += str(t)
|
||||
else:
|
||||
break
|
||||
arg = arg.strip()
|
||||
else:
|
||||
a += 1
|
||||
args[name] = arg
|
||||
elif isinstance(t, CloseDouble):
|
||||
break
|
||||
else:
|
||||
print "Unexpected:", t
|
||||
raise SyntaxError
|
||||
#print template, args
|
||||
if template[0] == '#':
|
||||
if template == "#if":
|
||||
if args['1'].strip():
|
||||
return args['2']
|
||||
elif '3' in args:
|
||||
return args['3']
|
||||
else:
|
||||
return ""
|
||||
elif template == "#ifeq":
|
||||
if args['1'].strip() == args['2'].strip():
|
||||
return args['3']
|
||||
elif '4' in args:
|
||||
return args['4']
|
||||
else:
|
||||
return ""
|
||||
elif template == "#ifexist":
|
||||
return ""
|
||||
elif template == "#switch":
|
||||
sw = args['1'].strip()
|
||||
if sw in args:
|
||||
return args[sw]
|
||||
else:
|
||||
return ""
|
||||
else:
|
||||
print "Unknown ParserFunction:", template
|
||||
sys.exit(1)
|
||||
if template not in templates:
|
||||
return "{{%s}}" % template
|
||||
return process(templates, templates[template], args)
|
||||
|
||||
def process(templates, s, args = {}):
|
||||
s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s)
|
||||
s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s)
|
||||
assert "<onlyinclude>" not in s
|
||||
#s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
|
||||
s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s)
|
||||
r = ""
|
||||
#print list(Tokenise(s))
|
||||
tokens = Tokenise(s)
|
||||
try:
|
||||
while True:
|
||||
t = tokens.next()
|
||||
if isinstance(t, OpenDouble):
|
||||
r += processTemplateCall(templates, tokens, args)
|
||||
elif isinstance(t, OpenTriple):
|
||||
r += processSub(templates, tokens, args)
|
||||
else:
|
||||
r += unicode(t)
|
||||
except StopIteration:
|
||||
pass
|
||||
return r
|
||||
|
||||
def test():
|
||||
templates = {
|
||||
'lb': "{{",
|
||||
'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
|
||||
't': "start-{{{1|pqr}}}-end",
|
||||
't0': "start-{{{1}}}-end",
|
||||
't1': "start{{{1}}}end<noinclude>moo</noinclude>",
|
||||
't2a1': "{{t2demo|a|{{{1}}}}}",
|
||||
't2a2': "{{t2demo|a|2={{{1}}}}}",
|
||||
't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
|
||||
't5': "{{t2demo|{{{a}}}=b}}",
|
||||
't6': "t2demo|a",
|
||||
}
|
||||
def t(text, expected):
|
||||
print "text:", text
|
||||
s = process(templates, text)
|
||||
if s != expected:
|
||||
print "got:", s
|
||||
print "expected:", expected
|
||||
sys.exit(1)
|
||||
t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
|
||||
t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
|
||||
t("{{t0|a}}", "start-a-end")
|
||||
t("{{t0| }}", "start- -end")
|
||||
t("{{t0|}}", "start--end")
|
||||
t("{{t0}}", "start-{{{1}}}-end")
|
||||
t("{{t0| }}", "start- -end")
|
||||
t("{{t0|\n}}", "start-\n-end")
|
||||
t("{{t0|1= }}", "start--end")
|
||||
t("{{t0|1=\n}}", "start--end")
|
||||
t("{{T}}", "start-pqr-end")
|
||||
t("{{T|}}", "start--end")
|
||||
t("{{T|abc}}", "start-abc-end")
|
||||
t("{{T|abc|def}}", "start-abc-end")
|
||||
t("{{T|1=abc|1=def}}", "start-def-end")
|
||||
t("{{T|abc|1=def}}", "start-def-end")
|
||||
t("{{T|1=abc|def}}", "start-def-end")
|
||||
t("{{T|{{T}}}}", "start-start-pqr-end-end")
|
||||
t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
|
||||
t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
|
||||
t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
|
||||
t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
|
||||
t("{{T|a=b}}", "start-pqr-end")
|
||||
t("{{T|1=a=b}}", "start-a=b-end")
|
||||
#t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
|
||||
#t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
|
||||
#t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
|
||||
#t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
|
||||
#t("{{ {{t6}} }}", "{{ t2demo|a }}")
|
||||
t("{{t|[[a|b]]}}", "start-b-end")
|
||||
t("{{t|[[a|b]] }}", "start-b -end")
|
||||
|
||||
Parts = {
|
||||
# Standard POS headers
|
||||
'noun': "n.",
|
||||
'Noun': "n.",
|
||||
'Noun 1': "n.",
|
||||
'Noun 2': "n.",
|
||||
'Verb': "v.",
|
||||
'Adjective': "adj.",
|
||||
'Adverb': "adv.",
|
||||
'Pronoun': "pron.",
|
||||
'Conjunction': "conj.",
|
||||
'Interjection': "interj.",
|
||||
'Preposition': "prep.",
|
||||
'Proper noun': "n.p.",
|
||||
'Proper Noun': "n.p.",
|
||||
'Article': "art.",
|
||||
|
||||
# Standard non-POS level 3 headers
|
||||
'{{acronym}}': "acr.",
|
||||
'Acronym': "acr.",
|
||||
'{{abbreviation}}': "abbr.",
|
||||
'[[Abbreviation]]': "abbr.",
|
||||
'Abbreviation': "abbr.",
|
||||
'[[initialism]]': "init.",
|
||||
'{{initialism}}': "init.",
|
||||
'Initialism': "init.",
|
||||
'Contraction': "cont.",
|
||||
'Prefix': "prefix",
|
||||
'Suffix': "suffix",
|
||||
'Symbol': "sym.",
|
||||
'Letter': "letter",
|
||||
'Idiom': "idiom",
|
||||
'Idioms': "idiom",
|
||||
'Phrase': "phrase",
|
||||
|
||||
# Debated POS level 3 headers
|
||||
'Number': "num.",
|
||||
'Numeral': "num.",
|
||||
'Cardinal number': "num.",
|
||||
'Ordinal number': "num.",
|
||||
'Cardinal numeral': "num.",
|
||||
'Ordinal numeral': "num.",
|
||||
|
||||
# Other headers in use
|
||||
'Personal pronoun': "pers.pron.",
|
||||
'Adjective/Adverb': "adj./adv.",
|
||||
'Proper adjective': "prop.adj.",
|
||||
'Determiner': "det.",
|
||||
'Demonstrative determiner': "dem.det.",
|
||||
'Clitic': "clitic",
|
||||
'Infix': "infix",
|
||||
'Counter': "counter",
|
||||
'Kanji': None,
|
||||
'Kanji reading': None,
|
||||
'Hiragana letter': None,
|
||||
'Katakana letter': None,
|
||||
'Pinyin': None,
|
||||
'Han character': None,
|
||||
'Hanzi': None,
|
||||
'Hanja': None,
|
||||
'Proverb': "prov.",
|
||||
'Expression': None,
|
||||
'Adjectival noun': None,
|
||||
'Quasi-adjective': None,
|
||||
'Particle': "part.",
|
||||
'Infinitive particle': "part.",
|
||||
'Possessive adjective': "poss.adj.",
|
||||
'Verbal prefix': "v.p.",
|
||||
'Postposition': "post.",
|
||||
'Prepositional article': "prep.art.",
|
||||
'Phrasal verb': "phr.v.",
|
||||
'Participle': "participle",
|
||||
'Interrogative auxiliary verb': "int.aux.v.",
|
||||
'Pronominal adverb': "pron.adv.",
|
||||
'Adnominal': "adn.",
|
||||
'Abstract pronoun': "abs.pron.",
|
||||
'Conjunction particle': None,
|
||||
'Root': "root",
|
||||
|
||||
# Non-standard, deprecated headers
|
||||
'Noun form': "n.",
|
||||
'Verb form': "v.",
|
||||
'Adjective form': "adj.form.",
|
||||
'Nominal phrase': "nom.phr.",
|
||||
'Noun phrase': "n. phrase",
|
||||
'Verb phrase': "v. phrase",
|
||||
'Transitive verb': "v.t.",
|
||||
'Intransitive verb': "v.i.",
|
||||
'Reflexive verb': "v.r.",
|
||||
'Cmavo': None,
|
||||
'Romaji': "rom.",
|
||||
'Hiragana': None,
|
||||
'Furigana': None,
|
||||
'Compounds': None,
|
||||
|
||||
# Other headers seen
|
||||
'Alternative forms': None,
|
||||
'Alternative spellings': None,
|
||||
'Anagrams': None,
|
||||
'Antonym': None,
|
||||
'Antonyms': None,
|
||||
'Conjugation': None,
|
||||
'Declension': None,
|
||||
'Declension and pronunciations': None,
|
||||
'Definite Article': "def.art.",
|
||||
'Definite article': "def.art.",
|
||||
'Demonstrative pronoun': "dem.pron.",
|
||||
'Derivation': None,
|
||||
'Derived expression': None,
|
||||
'Derived expressions': None,
|
||||
'Derived forms': None,
|
||||
'Derived phrases': None,
|
||||
'Derived terms': None,
|
||||
'Derived, Related terms': None,
|
||||
'Descendants': None,
|
||||
#'Etymology': None,
|
||||
#'Etymology 1': None,
|
||||
#'Etymology 2': None,
|
||||
#'Etymology 3': None,
|
||||
#'Etymology 4': None,
|
||||
#'Etymology 5': None,
|
||||
'Examples': None,
|
||||
'External links': None,
|
||||
'[[Gismu]]': None,
|
||||
'Gismu': None,
|
||||
'Homonyms': None,
|
||||
'Homophones': None,
|
||||
'Hyphenation': None,
|
||||
'Indefinite article': "art.",
|
||||
'Indefinite pronoun': "ind.pron.",
|
||||
'Indefinite Pronoun': "ind.pron.",
|
||||
'Indetermined pronoun': "ind.pron.",
|
||||
'Interrogative conjunction': "int.conj.",
|
||||
'Interrogative determiner': "int.det.",
|
||||
'Interrogative particle': "int.part.",
|
||||
'Interrogative pronoun': "int.pron.",
|
||||
'Legal expression': "legal",
|
||||
'Mass noun': "n.",
|
||||
'Miscellaneous': None,
|
||||
'Mutations': None,
|
||||
'Noun and verb': "n/v.",
|
||||
'Other language': None,
|
||||
'Pinyin syllable': None,
|
||||
'Possessive determiner': "poss.det.",
|
||||
'Possessive pronoun': "poss.pron.",
|
||||
'Prepositional phrase': "prep.phr.",
|
||||
'Prepositional Pronoun': "prep.pron.",
|
||||
'Pronunciation': None,
|
||||
'Pronunciation 1': None,
|
||||
'Pronunciation 2': None,
|
||||
'Quotations': None,
|
||||
'References': None,
|
||||
'Reflexive pronoun': "refl.pron.",
|
||||
'Related expressions': None,
|
||||
'Related terms': None,
|
||||
'Related words': None,
|
||||
'Relative pronoun': "rel.pron.",
|
||||
'Saying': "saying",
|
||||
'See also': None,
|
||||
'Shorthand': None,
|
||||
'[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
|
||||
'Sister projects': None,
|
||||
'Spelling note': None,
|
||||
'Synonyms': None,
|
||||
'Translation': None,
|
||||
'Translations': None,
|
||||
'Translations to be checked': None,
|
||||
'Transliteration': None,
|
||||
'Trivia': None,
|
||||
'Usage': None,
|
||||
'Usage in English': None,
|
||||
'Usage notes': None,
|
||||
'Verbal noun': "v.n.",
|
||||
}
|
||||
PartsUsed = {}
|
||||
for p in Parts.keys():
|
||||
PartsUsed[p] = 0
|
||||
|
||||
def encode(s):
|
||||
r = e(s)
|
||||
assert r[1] == len(s)
|
||||
return r[0]
|
||||
|
||||
def dowikilink(m):
    # Resolve a [[target]] / [[target|label]] wiki link to its display text.
    parts = m.group(1).split("|")
    link = parts[1] if len(parts) > 1 else parts[0]
    # Namespaced links (categories, images, interwiki) render as nothing.
    return "" if ':' in link else link
|
||||
|
||||
seentemplates = {}
|
||||
def dotemplate(m):
|
||||
aa = m.group(1).split("|")
|
||||
args = {}
|
||||
n = 0
|
||||
for a in aa:
|
||||
am = re.match(r"(.*?)(=(.*))?", a)
|
||||
if am:
|
||||
args[am.group(1)] = am.group(3)
|
||||
else:
|
||||
n += 1
|
||||
args[n] = am.group(1)
|
||||
|
||||
#if aa[0] in seentemplates:
|
||||
# seentemplates[aa[0]] += 1
|
||||
#else:
|
||||
# seentemplates[aa[0]] = 1
|
||||
# print len(seentemplates), aa[0]
|
||||
#print aa[0]
|
||||
|
||||
#if aa[0] not in Templates:
|
||||
# return "(unknown template %s)" % aa[0]
|
||||
#body = Templates[aa[0]]
|
||||
#body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
|
||||
#assert "<onlyinclude>" not in body
|
||||
##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
|
||||
#body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
|
||||
#def dotemplatearg(m):
|
||||
# ta = m.group(1).split("|")
|
||||
# if ta[0] in args:
|
||||
# return args[ta[0]]
|
||||
# elif len(ta) > 1:
|
||||
# return ta[1]
|
||||
# else:
|
||||
# return "{{{%s}}}" % ta[0]
|
||||
#body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
|
||||
#return dewiki(body)
|
||||
|
||||
def doparserfunction(m):
    """Evaluate a simple parser function; only #ifeq is handled.

    m.group(1) is the function name, m.group(2) its |-separated arguments.
    Unknown functions, and argument lists too short to answer, yield "".
    """
    a = m.group(2).split("|")
    if m.group(1) == "ifeq":
        # {{#ifeq:x|y|then|else}} — the original indexed a[1]/a[2] without
        # checking length, raising IndexError on malformed input.
        if len(a) >= 3 and a[0] == a[1]:
            return a[2]
        elif len(a) >= 4:
            return a[3]
    return ""
|
||||
|
||||
def dewiki(body, indent = 0):
|
||||
# process in this order:
|
||||
# {{{ }}}
|
||||
# <> <>
|
||||
# [[ ]]
|
||||
# {{ }}
|
||||
# ''' '''
|
||||
# '' ''
|
||||
#body = wikimediatemplate.process(Templates, body)
|
||||
body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
|
||||
#body = re.sub(r"{{(.*?)}}", dotemplate, body)
|
||||
#body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
|
||||
body = re.sub(r"'''(.*?)'''", r"\1", body)
|
||||
body = re.sub(r"''(.*?)''", r"\1", body)
|
||||
lines = body.split("\n")
|
||||
n = 0
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
if len(lines[i]) > 0 and lines[i][0] == "#":
|
||||
if len(lines[i]) > 1 and lines[i][1] == '*':
|
||||
wlines = textwrap.wrap(lines[i][2:].strip(),
|
||||
initial_indent = " * ",
|
||||
subsequent_indent = " ")
|
||||
elif len(lines[i]) > 1 and lines[i][1] == ':':
|
||||
wlines = textwrap.wrap(lines[i][2:].strip(),
|
||||
initial_indent = " ",
|
||||
subsequent_indent = " ")
|
||||
else:
|
||||
n += 1
|
||||
wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(),
|
||||
subsequent_indent = " ")
|
||||
elif len(lines[i]) > 0 and lines[i][0] == "*":
|
||||
n = 0
|
||||
wlines = textwrap.wrap(lines[i][1:].strip(),
|
||||
initial_indent = "* ",
|
||||
subsequent_indent = " ")
|
||||
else:
|
||||
n = 0
|
||||
wlines = textwrap.wrap(lines[i].strip())
|
||||
if len(wlines) == 0:
|
||||
wlines = ['']
|
||||
lines[i:i+1] = wlines
|
||||
i += len(wlines)
|
||||
return ''.join(" "*(indent-1)+x+"\n" for x in lines)
|
||||
|
||||
class WikiSection:
|
||||
def __init__(self, heading, body):
|
||||
self.heading = heading
|
||||
self.body = body
|
||||
#self.lines = re.split("\n+", body.strip())
|
||||
#if len(self.lines) == 1 and len(self.lines[0]) == 0:
|
||||
# self.lines = []
|
||||
self.children = []
|
||||
def __str__(self):
|
||||
return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children]))
|
||||
def add(self, section):
|
||||
self.children.append(section)
|
||||
|
||||
def parse(word, text):
    """Parse raw wiki markup into a tree of WikiSection objects.

    word: page title; becomes the heading of the root section.
    text: full wiki source of the page.

    Heading depth is the number of '=' characters in the wiki heading
    line; skipped levels are bridged with anonymous filler sections so
    the tree depth always matches the heading depth.
    Returns the root WikiSection.
    """
    # Raw string: '\s' in a plain string is an invalid escape on newer Pythons.
    headings = list(re.finditer(r"^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
    #print [x.group(1) for x in headings]
    doc = WikiSection(word, "")
    stack = [doc]
    for i, m in enumerate(headings):
        depth = len(m.group(1))
        if depth < len(stack):
            # Heading is shallower: pop back up to its parent level.
            stack = stack[:depth]
        else:
            # Heading is deeper than expected: insert anonymous fillers.
            while depth > len(stack):
                s = WikiSection(None, "")
                stack[-1].add(s)
                stack.append(s)
        # Section body runs from the end of this heading line to the
        # start of the next heading (or end of page for the last one).
        if i+1 < len(headings):
            s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip())
        else:
            s = WikiSection(m.group(2), text[m.end(0):].strip())
        assert len(stack) == depth
        stack[-1].add(s)
        stack.append(s)
    #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
    #    doc = doc.children[0]
    return doc
|
||||
|
||||
def formatFull(word, doc):
    """Render every section of the parsed page, indenting by tree depth,
    and append a reference line pointing back at the source article."""
    def render(depth, section):
        parts = []
        if section.heading:
            parts.append(" " * (depth - 1) + section.heading + "\n\n")
        if section.body:
            parts.append(dewiki(section.body, depth + 1) + "\n")
        for child in section.children:
            parts.append(render(depth + 1, child))
        return "".join(parts)

    return render(0, doc) + "Ref: http://en.wiktionary.org/wiki/%s\n" % word
|
||||
|
||||
def formatNormal(word, doc):
    # Render the page keeping only recognized part-of-speech sections
    # (those listed in the global Parts) at the "posdepth" heading level.
    # Unknown parts are logged to the global `errors` stream and their
    # whole subtree is dropped.
    def f(depth, posdepth, section):
        r = ""
        if depth == posdepth:
            if not section.heading or section.heading.startswith("Etymology"):
                # Etymology headings push the part-of-speech level one deeper.
                posdepth += 1
            elif section.heading in Parts:
                #p = Parts[section.heading]
                #if p:
                #    r += " "*(depth-1) + word + " (" + p + ")\n\n"
                r += " "*(depth-1) + section.heading + "\n\n"
            else:
                # Not a known part of speech: log it and skip this subtree.
                print >>errors, "Unknown part: (%s) %s" % (word, section.heading)
                return ""
        elif depth > posdepth:
            # Anything below the part-of-speech level is omitted entirely.
            return ""
        elif section.heading:
            r += " "*(depth-1) + section.heading + "\n\n"
        if section.body:
            r += dewiki(section.body, depth+1)+"\n"
        #r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines))
        #if len(section.lines) > 0:
        #    r += "\n"
        for c in section.children:
            r += f(depth+1, posdepth, c)
        return r
    # Part-of-speech headings are expected at depth 3 of the parse tree.
    s = f(0, 3, doc)
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
    return s
|
||||
|
||||
def formatBrief(word, doc):
    # Brief rendering: keep only '#' definition lines from section bodies.
    # Headings on the path to a contributing section are emitted lazily,
    # via `stack` ([heading, already_printed] pairs), the first time a
    # section beneath them actually produces output.
    def f(depth, posdepth, section):
        if depth == posdepth:
            h = section.heading
            if not section.heading or section.heading.startswith("Etymology"):
                # Etymology headings push the part-of-speech level one deeper.
                posdepth += 1
            elif section.heading in Parts:
                #h = Parts[section.heading]
                #if h:
                #    h = "%s (%s)" % (word, h)
                pass
            stack.append([h, False])
        elif depth > 0:
            stack.append([section.heading, False])
        else:
            # Root section: "%h" marks a dictfmt headword line.
            stack.append(["%h " + section.heading, False])
        r = ""
        #if section.heading:
        #    r += " "*(depth-1) + section.heading + "\n"
        # Keep only numbered-definition lines (those starting with '#').
        body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#')
        if len(body) > 0:
            # Flush all not-yet-printed headings above this section.
            for i in range(len(stack)):
                if not stack[i][1]:
                    if stack[i][0]:
                        r += " "*(i-1) + stack[i][0] + "\n"
                    stack[i][1] = True
            r += dewiki(body, depth+1)
        for c in section.children:
            r += f(depth+1, posdepth, c)
        stack.pop()
        return r
    stack = []
    # Part-of-speech headings are expected at depth 3 of the parse tree.
    s = f(0, 3, doc)
    s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
    return s
|
||||
|
||||
class WikiHandler(xml.sax.ContentHandler):
    # SAX handler skeleton for MediaWiki XML dumps.  Subclasses override
    # checkPage() to select pages by title and doPage() to process the
    # accumulated wikitext of each selected page.
    def __init__(self):
        self.element = None   # name of the XML element currently open
        self.page = None      # title of the page being accumulated, if selected
        self.text = ""        # wikitext accumulated for the current page
        self.long = {}        # pages whose text exceeded 100000 chars (reported once)
    def startElement(self, name, attrs):
        #print "start", name, attrs
        self.element = name
    def endElement(self, name):
        #print "end", name
        if self.element == "text":
            if self.page:
                if self.page in self.long:
                    # Flag unusually long pages on stdout for diagnostics.
                    print self.page, len(self.text)
                    print
                self.doPage(self.page, self.text)
                self.page = None
            self.text = ""
        self.element = None
    def characters(self, content):
        #print "characters", content
        # SAX may deliver text in multiple chunks; accumulate them.
        if self.element == "title":
            if self.checkPage(content):
                self.page = content
        elif self.element == "text":
            if self.page:
                self.text += content
                if len(self.text) > 100000 and self.page not in self.long:
                    self.long[self.page] = 1
    def checkPage(self, page):
        # Override: return True to process this page title.
        return False
    def doPage(self, page, text):
        # Override: handle the complete wikitext of an accepted page.
        pass
|
||||
|
||||
class TemplateHandler(WikiHandler):
    """First-pass handler: collects Template: pages into the global
    Templates map, keyed by lowercased template name."""

    def checkPage(self, page):
        return page.startswith("Template:")

    def doPage(self, page, text):
        # Strip the namespace prefix up to the first ':'.
        name = page[page.find(':') + 1:]
        Templates[name.lower()] = text
|
||||
|
||||
class WordHandler(WikiHandler):
    """Second-pass handler: converts ordinary (non-namespaced) word pages
    into brief dict entries written to the global `out` pipe."""

    def checkPage(self, page):
        # Namespaced pages (Template:, Wiktionary:, ...) are not words.
        return ':' not in page

    def doPage(self, page, text):
        m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
        if m:
            # BUG FIX: reference the redirect TARGET (m.group(1)), not the
            # page itself -- writing `page` made the entry point at itself.
            out.write(" See <%s>" % m.group(1))
            return
        doc = parse(page, text)
        out.write(formatBrief(page, doc))
        #print formatBrief(page, doc)
|
||||
|
||||
# --- main driver -----------------------------------------------------------
fn = sys.argv[1]  # path to the bzip2-compressed wiktionary XML dump

# NOTE(review): the internal indentation of this banner string was lost in
# transit -- confirm layout against the original file.
info = """ This file was converted from the original database on:
%s

The original data is available from:
http://en.wiktionary.org

The version from which this file was generated was:
%s

Wiktionary is available under the GNU Free Documentation License.
""" % (time.ctime(), os.path.basename(fn))

errors = codecs.open("mkdict.err", "w", "utf_8")
e = codecs.getencoder("utf_8")

# First pass over the dump: collect Template: pages (used for expansion).
Templates = {}
f = os.popen("bunzip2 -c %s" % fn, "r")
xml.sax.parse(f, TemplateHandler())
f.close()

# Second pass: convert word pages, piping the output through dictfmt,
# which builds the dictd index/data pair.
f = os.popen("bunzip2 -c %s" % fn, "r")
out = codecs.getwriter("utf_8")(
    os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w"))

out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8'))
xml.sax.parse(f, WordHandler())
f.close()
out.close()
|
||||
319
pkgs/servers/dict/wordnet_structures.py
Normal file
319
pkgs/servers/dict/wordnet_structures.py
Normal file
|
|
@ -0,0 +1,319 @@
|
|||
#!/usr/bin/env python
|
||||
#Copyright 2007 Sebastian Hagen
|
||||
# This file is part of wordnet_tools.
|
||||
|
||||
# wordnet_tools is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2
|
||||
# as published by the Free Software Foundation
|
||||
|
||||
# wordnet_tools is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with wordnet_tools; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
# This program requires python >= 2.4.
|
||||
|
||||
# This program converts wordnet index/data file pairs into dict index/data
|
||||
# files usable by dictd.
|
||||
# This is basically a reimplementation of the wnfilter program by Rik Faith,
|
||||
# which unfortunately doesn't work correctly for wordnet files in the newer
|
||||
# formats. This version of wordnet_structures whould parse wordnet 2.1 files
|
||||
# correctly, and create output very similar to what wnfilter would have
|
||||
# written.
|
||||
|
||||
import datetime
|
||||
from textwrap import TextWrapper
|
||||
|
||||
# Word-category (part of speech) codes used throughout this module.
CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

# Maps wordnet one-letter ss_type/pos codes to the CAT_* constants.
# 's' (adjective satellite) is folded into plain adjectives.
category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,
    'r': CAT_ADVERB
}
|
||||
|
||||
|
||||
class WordIndex:
    """One entry from a wordnet index.<pos> file: a lemma plus the synsets
    it participates in."""
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma                    # the headword
        self.category = category              # one of the CAT_* constants
        self.ptrs = ptrs                      # pointer symbols from the index line
        self.synsets = synsets                # Synset objects for this lemma
        self.tagsense_count = tagsense_count  # senses tagged in semantic concordances

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        """Parse one index line.

        Index line format (wordnet 2.x):
          lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt
          tagsense_cnt synset_offset [synset_offset...]
        synset_map maps synset offsets (ints) to Synset objects.
        """
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2],10)
        ptr_count = int(line_split[3],10)
        # BUG FIX: pointer symbols start at field 4 -- field 3 is p_cnt
        # itself.  The offsets already used below (tagsense at 5+ptr_count,
        # synsets from 6+ptr_count) confirm this layout.
        ptrs = [line_split[i] for i in range(4, 4+ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count],10)
        synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        """Parse a whole index file into {lowercased lemma: [WordIndex,...]}.

        Header lines (which start with a space) are skipped.  If rv_base is
        given, entries are merged into it; otherwise a new dict is built.
        """
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base

        for line in f:
            if (line.startswith(' ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
|
||||
|
||||
|
||||
class WordIndexDictFormatter(WordIndex):
    """WordIndex subclass that renders its synsets as a wrapped, numbered
    dictd article body."""
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    # NOTE(review): the whitespace widths of these continuation prefixes may
    # have been collapsed in transit -- confirm against the original file.
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' '

    def dict_str(self):
        """Return all synsets of this entry as numbered, wrapped lines.

        The first sense is prefixed with the part-of-speech abbreviation,
        subsequent senses with their 1-based number."""
        lines = []
        for number, synset in enumerate(self.synsets, 1):
            if number == 1:
                first_indent = self.prefix_fmtf_line_first % self.category_map_rev[self.category]
                cont_indent = self.prefix_fmtn_line_first
            else:
                first_indent = self.prefix_fmtf_line_nonfirst % number
                cont_indent = self.prefix_fmtn_line_nonfirst
            wrapper = TextWrapper(width=self.LINE_WIDTH_MAX,
                                  initial_indent=first_indent,
                                  subsequent_indent=cont_indent)
            lines.extend(wrapper.wrap(synset.dict_str()))
        return self.linesep.join(lines)
|
||||
|
||||
|
||||
class Synset:
    # One synset (word sense) parsed from a wordnet data.<pos> file.
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset    # byte offset of the line; used as unique id
        self.type = ss_type     # one of the CAT_* constants
        self.words = words      # lemma strings belonging to this synset
        self.ptrs = ptrs        # raw pointer 4-tuples (symbol, offset, pos, src/tgt)
        self.gloss = gloss      # definition text after the '|' separator
        self.frames = frames    # verb frames as (frame_number, word_index) pairs
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        # Data line format (wordnet 2.x):
        #   synset_offset lex_filenum ss_type w_cnt (word lex_id)... p_cnt
        #   (ptr_symbol synset_offset pos source/target)... [f_cnt frames] | gloss
        # Note: w_cnt is hexadecimal, p_cnt is decimal.
        line_split = line_data.split()
        synset_offset = int(line_split[0],10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3],16)
        # Words alternate with their lex_ids; take every second field.
        words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
        ptr_count = int(line_split[4 + word_count*2],10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            # Verb lines carry f_cnt plus "+ f_num w_num" frame triples
            # before the gloss separator.
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        # Re-split with a field limit so the gloss keeps internal whitespace.
        line_split2 = line_data.split(None, base)
        if (len(line_split2) < base):
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        # Parse a whole data file.
        # Returns (dict mapping synset offset -> Synset, license comment lines).
        rv = {}
        comments = []

        for line in f:
            if (line.startswith(' ')):
                # Header lines of the form "  <number> <text>" are the
                # embedded license; collect their text portion.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None,1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        # Gloss, plus a "{word}" synonym list when the synset has
        # more than one word.
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
|
||||
|
||||
|
||||
class WordnetDict:
    # Accumulates parsed wordnet index/data pairs and writes a complete
    # dictd database (index + data file).
    # NOTE(review): the leading whitespace inside this banner template may
    # have been collapsed in transit -- confirm against the original file.
    db_info_fmt = '''This file was converted from the original database on:
%(conversion_datetime)s

The original data is available from:
%(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restriction that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    # dictd's base64 alphabet for index offsets/lengths.
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}       # lowercased lemma -> [WordIndexDictFormatter,...]
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None    # taken from the data-file header on add

    def wn_dict_add(self, file_index, file_data):
        # Parse one index/data file pair and merge it into word_data.
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string"""
        # NOTE(review): relies on Python 2 integer division ('/'); under
        # Python 3 this would need '//'.
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the number of base64 digits required.
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = 64**e - 1

        # Emit digits most-significant first.
        rv = ''
        while (e > 0):
            e -= 1
            d = (i / 64**e)
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files"""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        # Index line format: headword TAB offset TAB length (both base64).
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the whole
        # story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        # NOTE(review): Python 2 idiom -- dict.keys() returns a sortable
        # list here; Python 3 would need sorted(...).
        words = self.word_data.keys()
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first synset that
                # matches this word case-insensitively
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            # Concatenate all senses of this word into one article body.
            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()
|
||||
|
||||
|
||||
if (__name__ == '__main__'):
    # Command-line driver: convert one or more wordnet index/data file
    # pairs into a single dictd index/data pair.
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    # NOTE(review): the leading whitespace of the two description defaults
    # may have been collapsed in transit -- confirm against the original.
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    # Positional arguments come in index/data pairs.
    for i in range(0,len(args),2):
        print 'Opening index file %r...' % args[i]
        file_index = file(args[i])
        print 'Opening data file %r...' % args[i+1]
        file_data = file(args[i+1])
        print 'Parsing index file and data file...'
        wnd.wn_dict_add(file_index, file_data)

    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

    wnd.dict_generate(file(options.oi, 'w'),file(options.od, 'w'))
    print 'All done.'
|
||||
Loading…
Add table
Add a link
Reference in a new issue