Benutzer:Jah/ppDump.js
Zur Navigation springen
Zur Suche springen
Hinweis: Leere nach dem Veröffentlichen den Browser-Cache, um die Änderungen sehen zu können.
- Firefox/Safari: Umschalttaste drücken und gleichzeitig Aktualisieren anklicken oder entweder Strg+F5 oder Strg+R (⌘+R auf dem Mac) drücken
- Google Chrome: Umschalttaste+Strg+R (⌘+Umschalttaste+R auf dem Mac) drücken
- Edge: Strg+F5 drücken oder Strg drücken und gleichzeitig Aktualisieren anklicken
#!/usr/bin/perl -w
use DB_File;
use Compress::Zlib;
use Digest::MD5 qw(md5 md5_base64);
# Command line: ppDump <lang> <project>.  The argument list is emptied
# afterwards so that every later <> reads the dump from STDIN.
die "usage: ppDump lang project" unless @ARGV==2;
($lang, $project) = @ARGV; @ARGV = ();

# All output files are written to $datadir as "$lang$project.<ext>".
$datadir = "data";
unless(-d $datadir) {
    # Fail early: every later open()/tie() targets this directory.
    mkdir $datadir or die "cannot create directory $datadir: $!";
}

$revsPerBlock=50;   # flushRevs() is called once this many revisions are buffered
$makeNEdits=1;      # count revisions per username and store them in the .nEdits DB
$saveText=1;        # write text data to the .txt file
$delayIdx=0;        # true: keep the page index in memory and write the DB at the end
# lws = length of word sequences; One Chinese character is treated as one word.
$lws = ($lang =~ /^(zh|ja|ko)$/)?8:5;
# xmlunesc($text)
#
# Returns a copy of $text in which the five predefined XML entities
# (&lt; &gt; &apos; &quot; &amp;) are replaced by the characters they stand
# for.  The argument itself is not modified.
sub xmlunesc {
    my $text = shift;
    $text =~ s/&lt;/</sg;
    $text =~ s/&gt;/>/sg;
    $text =~ s/&apos;/'/sg;
    $text =~ s/&quot;/"/sg;
    # &amp; has to be decoded last: otherwise "&amp;lt;" (an escaped "&lt;")
    # would be decoded twice and end up as "<".
    $text =~ s/&amp;/&/sg;
    $text;
}
# Copy the first input line verbatim to the .mw file (presumably the opening
# <mediawiki ...> tag of the dump -- confirm against the reader of that file).
open(my $mwFh, '>', "$datadir/$lang$project.mw")
    or die "cannot write $datadir/$lang$project.mw: $!";
$mw = <>;
# On empty input $mw is undef; an empty .mw file is written in that case.
print $mwFh $mw if defined $mw;
close($mwFh) or die "cannot close $datadir/$lang$project.mw: $!";
# xmlsiteinfo()
#
# Parses the <namespace> elements of the siteinfo header held in the global
# $si and fills
#   %namespace  - namespace name => numeric key
#   $image      - local name of namespace 6
#   $category   - local name of namespace 14
# Afterwards the raw siteinfo text is written to "$datadir/$lang$project.si".
# Dies if that file cannot be written.
sub xmlsiteinfo {
    # The key is matched as a number and further attributes (such as
    # case="first-letter") are skipped explicitly; a lazy "(.*?)" key would
    # otherwise absorb those attributes into the captured key.
    while($si =~ /<namespace key="(-?\d+)"[^>]*?(?:\/>|>(.*?)<\/namespace>)/sg) {
        my ($key, $name) = ($1, $2);
        next unless defined $name;      # self-closing element: namespace without a name
        $namespace{$name} = $key;
        $image = $name if $key==6;
        $category = $name if $key==14;
    }
    open(my $siFh, '>', "$datadir/$lang$project.si")
        or die "cannot write $datadir/$lang$project.si: $!";
    print $siFh $si;
    close($siFh) or die "cannot close $datadir/$lang$project.si: $!";
}
# Accumulate the dump header in $si, line by line, up to and including the
# line that closes the siteinfo element; that line triggers xmlsiteinfo()
# and ends this loop.
while(<>) {
    $si .= $_;
    next unless m{^\s*</siteinfo};
    xmlsiteinfo();
    last;
}
use Inline C => <<'END';
#include "unicodeAttributes.h"

/*
 * DJBHashes(sec, revNr, secNr, firstOccurrence, secNrsBySeq, lws)
 *
 * Scans the NUL-terminated UTF-8 byte string `sec`, splits it into words and
 * hashes every run of `lws` consecutive words.
 *
 *   - A word is a maximal run of characters whose general category is a
 *     letter (Lu, Ll, Lt, Lm, Lo) or a number (Nd, Nl, No); every character
 *     flagged as CJK forms a word of its own.
 *   - The hash is hash = 33*hash + byte, seeded with 5381, over the bytes of
 *     the `lws` words with a single ' ' between consecutive words.
 *   - firstOccurrence{hash} is set to revNr unless the key already exists.
 *   - secNr is appended to the array referenced by secNrsBySeq{hash}
 *     (the array is created on demand), once per occurrence.
 *
 * unicodeGC[] and the unicode_* constants come from unicodeAttributes.h; as
 * used here, the low 7 bits of an entry are the general category and the top
 * bit marks CJK characters.
 *
 * lws must be >= 1 (it sizes the ring buffers and is used as a modulus).
 * Malformed UTF-8 (stray continuation bytes, truncated sequences, lead bytes
 * >= 0xF8) is skipped one byte at a time.
 */
void DJBHashes(unsigned char* sec, int revNr, int secNr, HV* firstOccurrence, HV* secNrsBySeq, int lws) {
    int wStart[lws], wEnd[lws]; // ring buffers: start and end(excl) of words
    int pos=0, wNr=0;
    int b1, b2, b3, b4, inWord=0, inCJK=0;
    /* c is tested by the loop condition even when an iteration leaves early
       via `continue`, so it must start out defined and non-zero. */
    int c=1;
    do {
        int pos0 = pos;
        b1=sec[pos++];
        if (b1<128) {
            c = b1;
        } else if(b1<192) {
            continue;   /* stray continuation byte */
        } else if(b1<224) {
            /* Only consume bytes that really are continuation bytes
               (10xxxxxx).  The terminating NUL is not one, so a truncated
               sequence can never move pos beyond the end of the string. */
            if((sec[pos]&0xC0)!=0x80)
                continue;
            b2 = sec[pos++];
            c = ((b1&31)<<6) | (b2&63);
        } else if(b1<240) {
            if((sec[pos]&0xC0)!=0x80 || (sec[pos+1]&0xC0)!=0x80)
                continue;
            b2 = sec[pos++];
            b3 = sec[pos++];
            c = ((b1&15)<<12) | ((b2&63)<<6) | (b3&63);
        } else if(b1<248) {
            if((sec[pos]&0xC0)!=0x80 || (sec[pos+1]&0xC0)!=0x80 || (sec[pos+2]&0xC0)!=0x80)
                continue;
            b2 = sec[pos++];
            b3 = sec[pos++];
            b4 = sec[pos++];
            c = ((b1&7)<<18) | ((b2&63)<<12) | ((b3&63)<<6) | (b4&63);
        } else {
            continue;   /* 0xF8..0xFF never start a sequence */
        }
        if(c>=17*65536)
            continue;   /* beyond the 17 planes indexed through unicodeGC[] */
        unsigned char generalCategory = unicodeGC[c] & 0x7f;
        unsigned char isCJK = unicodeGC[c] >= 0x80;
        int isAlNum =
            generalCategory == unicode_Lu ||
            generalCategory == unicode_Ll ||
            generalCategory == unicode_Lt ||
            generalCategory == unicode_Lm ||
            generalCategory == unicode_Lo ||
            generalCategory == unicode_Nd ||
            generalCategory == unicode_Nl ||
            generalCategory == unicode_No;
        int newWord = 0;
        /* Close the word that ends just before the current character.  The
           terminating NUL is not alphanumeric, so it closes a trailing word. */
        if(inCJK) {
            wEnd[wNr++%lws]=pos0;
            inCJK=0;
            newWord=1;
        } else if(inWord && (!isAlNum || isCJK)) {
            wEnd[wNr++%lws]=pos0;
            inWord=0;
            newWord=1;
        }
        /* Once at least lws words are complete, hash the last lws of them. */
        if(newWord && wNr>=lws) {
            unsigned int hash=5381;
            int j;
            for(j=wNr-lws; j<wNr; j++) {
                int k;
                for(k=wStart[j%lws]; k<wEnd[j%lws]; k++)
                    hash = 33*hash+sec[k];
                if(j<wNr-1)
                    hash = 33*hash+' ';
            }
            SV* hashSV = sv_2mortal(newSViv(hash));
            if(!hv_exists_ent(firstOccurrence, hashSV, 0))
                hv_store_ent(firstOccurrence, hashSV, newSViv(revNr), 0);
            AV* secNrs;
            HE* secNrsHE = hv_fetch_ent(secNrsBySeq, hashSV, 0, 0);
            if(!secNrsHE) {
                secNrs = newAV();
                hv_store_ent(secNrsBySeq, hashSV, newRV_noinc((SV*)secNrs), 0);
            } else {
                secNrs = (AV*)SvRV(HeVAL(secNrsHE));
            }
            av_push(secNrs, newSViv(secNr));
        }
        /* Open a new word at the current character if it starts one. */
        if(isCJK) {
            wStart[wNr%lws] = pos0;
            inCJK=1;
        } else if(isAlNum && !inWord) {
            wStart[wNr%lws] = pos0;
            inWord=1;
        }
    } while(c!=0);
}
END
# Maps the name of a character entity reference (the part between "&" and
# ";") to its Unicode code point, as a decimal number.  Consulted by
# entityToUnicodeChar() for named references.
# Note: zwnj, zwj, lrm and rlm are listed twice (first and fourth-last row)
# with identical values, so the duplication has no effect on the hash.
%entityToUnicode = (
zwnj => 8204, zwj => 8205, lrm => 8206, rlm => 8207,
nbsp => 160, iexcl => 161, cent => 162, pound => 163,
curren => 164, yen => 165, brvbar => 166, sect => 167,
uml => 168, copy => 169, ordf => 170, laquo => 171,
not => 172, shy => 173, reg => 174, macr => 175,
deg => 176, plusmn => 177, sup2 => 178, sup3 => 179,
acute => 180, micro => 181, para => 182, middot => 183,
cedil => 184, sup1 => 185, ordm => 186, raquo => 187,
frac14 => 188, frac12 => 189, frac34 => 190, iquest => 191,
Agrave => 192, Aacute => 193, Acirc => 194, Atilde => 195,
Auml => 196, Aring => 197, AElig => 198, Ccedil => 199,
Egrave => 200, Eacute => 201, Ecirc => 202, Euml => 203,
Igrave => 204, Iacute => 205, Icirc => 206, Iuml => 207,
ETH => 208, Ntilde => 209, Ograve => 210, Oacute => 211,
Ocirc => 212, Otilde => 213, Ouml => 214, times => 215,
Oslash => 216, Ugrave => 217, Uacute => 218, Ucirc => 219,
Uuml => 220, Yacute => 221, THORN => 222, szlig => 223,
agrave => 224, aacute => 225, acirc => 226, atilde => 227,
auml => 228, aring => 229, aelig => 230, ccedil => 231,
egrave => 232, eacute => 233, ecirc => 234, euml => 235,
igrave => 236, iacute => 237, icirc => 238, iuml => 239,
eth => 240, ntilde => 241, ograve => 242, oacute => 243,
ocirc => 244, otilde => 245, ouml => 246, divide => 247,
oslash => 248, ugrave => 249, uacute => 250, ucirc => 251,
uuml => 252, yacute => 253, thorn => 254, yuml => 255,
fnof => 402, Alpha => 913, Beta => 914, Gamma => 915,
Delta => 916, Epsilon => 917, Zeta => 918, Eta => 919,
Theta => 920, Iota => 921, Kappa => 922, Lambda => 923,
Mu => 924, Nu => 925, Xi => 926, Omicron => 927,
Pi => 928, Rho => 929, Sigma => 931, Tau => 932,
Upsilon => 933, Phi => 934, Chi => 935, Psi => 936,
Omega => 937, alpha => 945, beta => 946, gamma => 947,
delta => 948, epsilon => 949, zeta => 950, eta => 951,
theta => 952, iota => 953, kappa => 954, lambda => 955,
mu => 956, nu => 957, xi => 958, omicron => 959,
pi => 960, rho => 961, sigmaf => 962, sigma => 963,
tau => 964, upsilon => 965, phi => 966, chi => 967,
psi => 968, omega => 969, thetasym => 977, upsih => 978,
piv => 982, bull => 8226, hellip => 8230, prime => 8242,
Prime => 8243, oline => 8254, frasl => 8260, weierp => 8472,
image => 8465, real => 8476, trade => 8482, alefsym => 8501,
larr => 8592, uarr => 8593, rarr => 8594, darr => 8595,
harr => 8596, crarr => 8629, lArr => 8656, uArr => 8657,
rArr => 8658, dArr => 8659, hArr => 8660, forall => 8704,
part => 8706, exist => 8707, empty => 8709, nabla => 8711,
isin => 8712, notin => 8713, ni => 8715, prod => 8719,
sum => 8721, minus => 8722, lowast => 8727, radic => 8730,
prop => 8733, infin => 8734, ang => 8736, and => 8743,
or => 8744, cap => 8745, cup => 8746, int => 8747,
there4 => 8756, sim => 8764, cong => 8773, asymp => 8776,
ne => 8800, equiv => 8801, le => 8804, ge => 8805,
sub => 8834, sup => 8835, nsub => 8836, sube => 8838,
supe => 8839, oplus => 8853, otimes => 8855, perp => 8869,
sdot => 8901, lceil => 8968, rceil => 8969, lfloor => 8970,
rfloor => 8971, lang => 9001, rang => 9002, loz => 9674,
spades => 9824, clubs => 9827, hearts => 9829, diams => 9830,
quot => 34, amp => 38, lt => 60, gt => 62,
OElig => 338, oelig => 339, Scaron => 352, scaron => 353,
Yuml => 376, circ => 710, tilde => 732, ensp => 8194,
emsp => 8195, thinsp => 8201, zwnj => 8204, zwj => 8205,
lrm => 8206, rlm => 8207, ndash => 8211, mdash => 8212,
lsquo => 8216, rsquo => 8217, sbquo => 8218, ldquo => 8220,
rdquo => 8221, bdquo => 8222, dagger => 8224, Dagger => 8225,
permil => 8240, lsaquo => 8249, rsaquo => 8250, euro => 8364,
);
# entityToUnicodeChar($entity)
#
# Translates one character reference into the character it denotes and
# returns it as a one-character string.  Three forms are tried in this order:
#   &name;   - looked up in %entityToUnicode
#   &#NNN;   - decimal code point
#   &#xHHH;  - hexadecimal code point
# The empty string is returned for unknown names, for code points of 65536
# and above, and when none of the three forms is found in $entity.
sub entityToUnicodeChar {
    my ($ref) = @_;

    # Named reference.
    if($ref =~ /&(\w+);/) {
        my $codePoint = $entityToUnicode{$1};
        return defined($codePoint) ? pack("U", $codePoint) : "";
    }

    # Decimal numeric reference.
    if($ref =~ /&#(\d+);/) {
        return $1<65536 ? chr($1) : "";
    }

    # Hexadecimal numeric reference.
    if($ref =~ /&#x([0-9A-Fa-f]+);/) {
        my $codePoint = hex $1;
        return $codePoint<65536 ? pack("U", $codePoint) : "";
    }

    return "";
}
# findWordSequences($sec, $secNr, $revNr)
#
# Reduces the wiki text of one section to running text and passes it to
# DJBHashes(), which records the word-sequence hashes of the section in
# %firstOccurrence and %secNrsBySeq.
#   $sec    - section text as found in the dump (XML-escaped bytes)
#   $secNr  - number of the section within the current block
#   $revNr  - number of the revision in which the section was first seen
# Sections longer than 200000 bytes are skipped.
sub findWordSequences {
    my ($sec, $secNr, $revNr) = @_;
    return if do { use bytes; length $sec } > 200000;
    $sec = xmlunesc($sec);

    # Namespace prefixes of category and image links: the localized names
    # from the siteinfo header (if known) plus the English defaults.  The
    # names are quoted so that they cannot act as regex syntax.
    my $nsAlt = join "|",
        (map { quotemeta($_) } grep { defined($_) && length($_) } $category, $image),
        "category", "image";

    # $sec =~ s/^=+.*?=+//;
    # Interlanguage links.
    $sec =~ s/\[\[(?:.{2,3}|minnan|simple):.+?\]\]//isg;
    # Ordinary links: keep only the link text.
    $sec =~ s/\[\[(?!(?:$nsAlt):)([^\]\|]*?\|)?([^\|]+?)\]\]/$2/isg;
    # Templates and tables; the loops peel nested constructs from the inside out.
    while($sec =~ s/\{\{((?!\{\{).)*?\}\}//sg) {}
    while($sec =~ s/\{\|((?!\{\|).)*?\|\}//sg) {}
    while($sec =~ s/<table((?!<table).)*?<\/table>//isg) {}
    # Category and image links.
    $sec =~ s/\[\[(?:$nsAlt):.*?\]\]//isg;
    # External links, bracketed and bare.
    $sec =~ s/\*? ?\[(?:http|ftp|mailto).*?\]//isg;
    $sec =~ s/\*? ?(?:http|ftp|mailto):\S*//isg;
    # Formulas and markup tags.
    $sec =~ s/<math>.*?<\/math>//sg;
    $sec =~ s/<(?:div|font|span).*?>//isg;
    $sec =~ s/<.{1,10}?>/ /sg;
    # Character references.  The whole reference "&...;" has to be captured:
    # "&" and ";" belong to every alternative, not just to one of them.
    # $sec holds UTF-8 bytes, so the decoded character is converted to UTF-8
    # bytes before it is spliced in; inserting a character string would
    # upgrade $sec and re-encode the bytes it already contains.
    $sec =~ s/(&(?:\w+|#(?:\d+|x[0-9A-Fa-f]+));)/my $c = entityToUnicodeChar($1); $c = "" unless defined $c; utf8::encode($c); $c/esg;
    DJBHashes($sec, $revNr, $secNr, \%firstOccurrence, \%secNrsBySeq, $lws);
}
# updateLastOccurrences()
#
# For every word-sequence hash in %secNrsBySeq, raises $lastOccurrence{hash}
# to the largest @secLastOccurrence entry among the sections listed for that
# hash.  An existing larger value in %lastOccurrence is kept.
sub updateLastOccurrences {
    foreach my $seq (keys %secNrsBySeq) {
        foreach my $secNr (@{$secNrsBySeq{$seq}}) {
            my $candidate = $secLastOccurrence[$secNr];
            next if defined($lastOccurrence{$seq}) && !($candidate>$lastOccurrence{$seq});
            $lastOccurrence{$seq} = $candidate;
        }
    }
}
# flushSeqs()
#
# Writes one 12-byte record per entry of %firstOccurrence to the SEQ handle:
# the sequence hash, the revision id of its first occurrence and the revision
# id of its last occurrence, each as a 32-bit big-endian integer.
# The number of records is left in the global $nSeqs.
sub flushSeqs {
    my @seqs = keys %firstOccurrence;
    $nSeqs = @seqs;
    foreach my $seq (@seqs) {
        my $firstId = $revId[$firstOccurrence{$seq}];
        my $lastId = $revId[$lastOccurrence{$seq}];
        print SEQ pack("N3", $seq, $firstId, $lastId);
    }
}
# text($t1, $t2, $t3, $dontSplit, $revNr)
#
# Produces the replacement for one <text>...</text> element of a revision.
#   $t1, $t3   - opening and closing tag as matched by the caller
#   $t2        - the text between the tags
#   $dontSplit - true: store the text as a whole; false: store it as a list
#                of section numbers
#   $revNr     - number of the revision within the page (split mode only)
#
# Split mode: $t2 is cut in front of every "==...==" heading line.  Sections
# are identified by their MD5; a section not yet known in the current block is
# appended to @$secs (with its MD5 and title) and scanned by
# findWordSequences(), a known one only gets its @secLastOccurrence entry
# updated.  Returns '<text type="sectionlist" ...>' followed by the
# space-separated section numbers and $t3.
#
# Whole mode: the text is written to TXT (deflated if longer than 100 bytes)
# unless an identical text was already stored for this page, and a
# self-closing <text offset=... /> element pointing to it is returned.
# $posTxt is advanced by the number of bytes written.
sub text {
    my ($t1, $t2, $t3, $dontSplit, $revNr) = @_;
    # return "$t1$t2$t3";
    my $lenTxt = do { use bytes; length $t2 };
    if(!$dontSplit) {
        my @s = split(/\n(?===[^\n]*?==\r?\n)/, $t2);
        my @sNrs = (); my @sMd5 = ();
        for(my $i=0; $i<@s; $i++) {
            # split() removed the newline in front of each heading; give it
            # back to the section that precedes the heading.
            $s[$i] .= "\n" if $i<@s-1;
            my $smd5 = md5_base64($s[$i]);
            push @sMd5, $smd5;
            # Title key: "<heading level>,<heading text>"; "0," without heading.
            my $secTitle = "0,";
            if($s[$i] =~ /^([=]{2,}) *([^\n]*?) *\1\r?\n/s) {
                $secTitle = length($1).",$2";
            }
            if(defined $secNrByMd5->{$smd5}) {
                push @sNrs, $secNrByMd5->{$smd5};
                $secLastOccurrence[$secNrByMd5->{$smd5}] = $revNr;
            } else {
                my $sNr = @$secs;
                push @sNrs, $sNr;
                $secNrByMd5->{$smd5} = $sNr;
                push @{$secNrsByTitle->{$secTitle}}, $sNr;
                push @$secs, $s[$i];
                push @$secsMd5, $smd5;
                push @$secsTitle, $secTitle;
                push @secLastOccurrence, $revNr;
                findWordSequences($s[$i], @$secs-1, $revNr);
            }
        }
        my $md5 = md5_base64(join(" ", @sMd5));
        $t1 = "<text type=\"sectionlist\" length=\"$lenTxt\" md5=\"$md5\">";
        $t2 = join(" ", @sNrs);
        return "$t1$t2$t3";
    } else {
        my $md5 = md5_base64($t2);
        return $txtByMd5{$md5} if defined $txtByMd5{$md5};
        my $lenWritten;     # bytes this call adds to the TXT file
        if($lenTxt>100) {
            ($gzTxt, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
            ($buf, $status) = $gzTxt->deflate($t2);
            print TXT $buf if $saveText;
            my $lenTxtGz = do { use bytes; length $buf };
            ($buf, $status) = $gzTxt->flush();
            print TXT $buf if $saveText;
            $lenTxtGz += do { use bytes; length $buf };
            $t1 = "<text offset=\"$posTxt\" lengthGz=\"$lenTxtGz\" length=\"$lenTxt\" md5=\"$md5\" />";
            $lenWritten = $lenTxtGz;
        } else {
            print TXT $t2 if $saveText;
            $t1 = "<text offset=\"$posTxt\" length=\"$lenTxt\" md5=\"$md5\" />";
            $lenWritten = $lenTxt;
        }
        # $posTxt is the offset of the next write into TXT, so it has to grow
        # by what was written (the deflated size for long texts), not by the
        # uncompressed length.
        $posTxt += $lenWritten;
        $txtByMd5{$md5} = $t1;
        return $t1;
    }
}
# flushSecs()
#
# Stores the sections collected in @$secs for the current block.
# The section texts go to TXT as one deflate stream, visited title by title
# (via %$secNrsByTitle).  A <sectiongroup> element describing the stream and,
# per section, its offset and length in the uncompressed stream, its MD5 and
# its title is then fed to the $gzRev stream and written to REV.
# With no sections, an empty <sectiongroup /> element is emitted.
# Updates $posTxt and $lenRev; leaves the element text in the global $xml.
sub flushSecs {
    if(!@$secs) {
        $xml = " <sectiongroup />\n";
    } else {
        ($gzTxt, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
        my $gzLen = 0;      # deflated bytes produced so far
        my $offset = 0;     # position within the uncompressed stream
        my (@secOffset, @secLen);
        foreach my $secTitle (keys %$secNrsByTitle) {
            foreach my $sNr (@{$secNrsByTitle->{$secTitle}}) {
                $secOffset[$sNr] = $offset;
                $secLen[$sNr] = do { use bytes; length $secs->[$sNr] };
                $offset += $secLen[$sNr];
                ($buf, $status) = $gzTxt->deflate($secs->[$sNr]);
                print TXT $buf if $saveText;
                $gzLen += do { use bytes; length $buf };
            }
        }
        ($buf, $status) = $gzTxt->flush();
        print TXT $buf if $saveText;
        $gzLen += do { use bytes; length $buf };
        $xml = qq{ <sectiongroup offset="$posTxt" length="$gzLen">\n};
        foreach my $i (0..$#$secs) {
            $xml .= qq{ <section offset="$secOffset[$i]" length="$secLen[$i]" }
                  . qq{md5="$secsMd5->[$i]" title="$secsTitle->[$i]" />\n};
        }
        $xml .= " </sectiongroup>\n";
        $posTxt += $gzLen;
    }
    ($buf, $status) = $gzRev->deflate($xml);
    print REV $buf;
    $lenRev += do { use bytes; length $buf };
}
# flushRevs($lastBlock)
#
# Writes the revisions buffered in @revs to REV and empties the buffer.
#   $lastBlock - true when called for the closing tag of the page
#
# On the first flush of a page ($revNr == @revs):
#   - a page with exactly one revision is written uncompressed, with its
#     text stored as a whole by text(); $cm is set to 0 and nothing else
#     is done;
#   - otherwise a deflate stream ($gzRev) is started with the page header
#     and $cm is set to 1.
# In the compressed case each flush emits one block: the section group
# (flushSecs) followed by the revisions, whose text has been replaced by
# section lists.  The last block also closes the page and the stream.
# $lenRev accumulates the number of bytes written to REV for the page.
sub flushRevs {
    my $lastBlock = $_[0];
    if($revNr == @revs) {
        if(@revs==1) {
            $revs[0] =~ s/(<text.*?>)(.*?)(<\/text>)/text($1,$2,$3,1,$revNrs[0])/es;
            $xml = $page . $revs[0] . " </page>\n";
            print REV $xml;
            $lenRev = do { use bytes; length $xml };
            $cm = 0;
            # Empty the buffers on this path as well.  @revNrs is not reset
            # when a new page starts, so an entry left behind here would
            # shift the revision numbers of the following pages against @revs.
            @revs = ();
            @revNrs = ();
            return;
        }
        ($gzRev, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
        ($buf, $status) = $gzRev->deflate($page);
        print REV $buf;
        $lenRev = do { use bytes; length $buf };
        $cm = 1;
    }
    # Section bookkeeping is per block.
    $secs = []; $secNrByMd5 = {}; $secNrsByTitle = {}; $secsMd5 = []; $secsTitle = [];
    %secNrsBySeq = (); @secLastOccurrence = ();
    for(my $i=0; $i<@revs; $i++) {
        $revs[$i] =~ s/(<text.*?>)(.*?)(<\/text>)/text($1,$2,$3,0,$revNrs[$i])/es;
    }
    flushSecs();
    updateLastOccurrences();
    for(my $i=0; $i<@revs; $i++) {
        ($buf, $status) = $gzRev->deflate($revs[$i]);
        print REV $buf;
        $lenRev += do { use bytes; length $buf };
    }
    if($lastBlock) {
        ($buf, $status) = $gzRev->deflate(" </page>\n");
        print REV $buf;
        $lenRev += do { use bytes; length $buf };
        ($buf, $status) = $gzRev->flush();
        print REV $buf;
        $lenRev += do { use bytes; length $buf };
    }
    @revs = ();
    @revNrs = ();
}
# ---- Main pass over the pages of the dump (read from STDIN) ----
#
# Output files, all named "$datadir/$lang$project.<ext>":
#   .rev     page records written by flushRevs()
#   .txt     text data written by text() and flushSecs() (only if $saveText)
#   .seq     word-sequence records written by flushSeqs()
#   .idx     DB_File: page title => "posRev lenRev cm posSeq nSeqs"
#   .nEdits  DB_File: username => number of revisions (only if $makeNEdits)
unlink "$datadir/$lang$project.idx", "$datadir/$lang$project.nEdits";
unless($delayIdx) {
    tie(%idx, "DB_File", "$datadir/$lang$project.idx")
        or die "cannot tie $datadir/$lang$project.idx: $!";
}
# The handles stay package-global because the flush subs print to them by
# name.  They carry deflate output and packed integers, hence binmode.
open(REV, '>', "$datadir/$lang$project.rev")
    or die "cannot write $datadir/$lang$project.rev: $!";
binmode REV;
if($saveText) {
    open(TXT, '>', "$datadir/$lang$project.txt")
        or die "cannot write $datadir/$lang$project.txt: $!";
    binmode TXT;
}
open(SEQ, '>', "$datadir/$lang$project.seq")
    or die "cannot write $datadir/$lang$project.seq: $!";
binmode SEQ;
$posRev = 0; $posTxt = 0; $posSeq = 0;
while(<>) {
    # Start of a page: reset all per-page state.  The test also fires while
    # a page of an unwanted namespace is being skipped.
    if((!$inPage || !$nsOk) && /^\s*<page/) {
        $page = $_;
        @revs = ();
        $revNr = 0;
        $inPage = 1;
        $nsOk = 1;
        $lenRev = 0;
        %txtByMd5 = ();
        %firstOccurrence = ();
        %lastOccurrence = ();
        @revId = ();
        next;
    }
    next unless $nsOk;
    if($inRev) {
        if($inText) {
            # Inside a multi-line <text> element: collect up to the closing tag.
            $rev .= $_;
            if(/<\/text>$/) {
                $inText = 0;
            }
        } else {
            if(/^(\s*<text[^>]*?)( \/)?>/) {
                # A self-closing <text ... /> is expanded to an empty element
                # so that flushRevs() can always match <text>...</text>.
                $_ = "$1></text>\n" if defined $2;
                $rev .= $_;
                $inText = 1 unless defined $2 || $' =~/<\/text>$/;
            } elsif(/^\s*<\/revision/) {
                $rev .= $_;
                push @revs, $rev;
                push @revNrs, $revNr++;
                flushRevs(0) if @revs==$revsPerBlock;
                $inRev = 0;
            } elsif(/^\s*<username>(.*?)<\/username>/) {
                $rev .= $_;
                $nEdits{$1}++ if $makeNEdits;
            } elsif(/^\s*<contributor>/) {
                $rev .= $_;
                $inContributor = 1;
            } elsif(/^\s*<\/contributor>/) {
                $rev .= $_;
                $inContributor = 0;
            } elsif(/^\s*<id>(.*?)<\/id>/ && !$inContributor) {
                # An <id> outside <contributor> is the id of the revision.
                $rev .= $_;
                #print STDERR "$title, revNr=$revNr, id=$1\n";
                $revId[$revNr] = $1;
            } else {
                $rev .= $_;
            }
        }
        next;
    }
    if(/^\s*<revision/) {
        $rev = $_;
        $inRev = 1;
        next;
    }
    if(/^\s*<\/page/) {
        flushRevs(1);
        flushSeqs();
        $idx{$title} = "$posRev $lenRev $cm $posSeq $nSeqs";
        $posRev += $lenRev;
        $posSeq += 12*$nSeqs;   # flushSeqs() writes 12 bytes per sequence
        $inPage = 0;
        #last if $title eq "Aussagenlogik";
        next;
    }
    # Any other line outside a revision belongs to the page header.
    $page .= $_;
    if(/^\s*<title(?:.*?)>(.*?)<\/title>/) {
        $title = xmlunesc($1);
        if($title =~ /(.+?):(.+)/ && defined $namespace{$1}) {
            $namespace = $namespace{$1};
        } else {
            $namespace = 0;
        }
        $title =~ s/\s/_/g;
        # Only pages of namespace 0 are processed.
        $nsOk = 0 if $namespace!=0;
    }
}
close(REV) or die "cannot close $datadir/$lang$project.rev: $!";
if($saveText) {
    close(TXT) or die "cannot close $datadir/$lang$project.txt: $!";
}
close(SEQ) or die "cannot close $datadir/$lang$project.seq: $!";
if($delayIdx) {
    # The index was collected in a plain hash; copy it into the DB file now.
    tie(%idx2, "DB_File", "$datadir/$lang$project.idx")
        or die "cannot tie $datadir/$lang$project.idx: $!";
    %idx2 = %idx;
    untie %idx2;
} else {
    untie %idx;
}
if($makeNEdits) {
    tie(%nEdits2, "DB_File", "$datadir/$lang$project.nEdits")
        or die "cannot tie $datadir/$lang$project.nEdits: $!";
    %nEdits2 = %nEdits;
    untie %nEdits2;
}