Benutzer:Jah/ppDump.js

aus Wikipedia, der freien Enzyklopädie
Zur Navigation springen Zur Suche springen

Hinweis: Leere nach dem Veröffentlichen den Browser-Cache, um die Änderungen sehen zu können.

  • Firefox/Safari: Umschalttaste drücken und gleichzeitig Aktualisieren anklicken oder entweder Strg+F5 oder Strg+R (⌘+R auf dem Mac) drücken
  • Google Chrome: Umschalttaste+Strg+R (⌘+Umschalttaste+R auf dem Mac) drücken
  • Edge: Strg+F5 drücken oder Strg drücken und gleichzeitig Aktualisieren anklicken
#!/usr/bin/perl -w

use DB_File;
use Compress::Zlib;
use Digest::MD5 qw(md5 md5_base64);

# Command line: language code and project name. Together they form the
# prefix of every output file ("$datadir/$lang$project.*").
die "usage: ppDump lang project" unless @ARGV==2;
($lang, $project) = @ARGV;
# Empty @ARGV so that every later <> reads the dump from STDIN instead of
# treating the two arguments as input file names.
@ARGV = ();

# Directory that receives all output files; created on first use.
$datadir = "data";
unless(-d $datadir) {
	# Fail early: without the directory every later open would fail.
	mkdir $datadir or die "cannot create directory $datadir: $!";
}

# Number of revisions collected before they are flushed as one block.
$revsPerBlock=50;
# When true, count the edits per user name and store them in the .nEdits file.
$makeNEdits=1;
# When true, revision/section texts are written to the .txt file.
$saveText=1;
# When true, the page index is kept in memory and only copied into the
# DB_File at the very end; otherwise %idx is tied to the file directly.
$delayIdx=0;
# lws = length of word sequences; One Chinese character is treated as one word.
$lws = ($lang =~ /^(zh|ja|ko)$/)?8:5;

# Replace the five predefined XML entities in a string by the characters
# they stand for and return the result; the argument itself is not modified.
sub xmlunesc {
	my ($text) = @_;
	# "&amp;" is handled last so that e.g. "&amp;lt;" becomes "&lt;"
	# and is not unescaped a second time.
	my @table = (
		['&lt;',   '<'],
		['&gt;',   '>'],
		['&apos;', "'"],
		['&quot;', '"'],
		['&amp;',  '&'],
	);
	foreach my $entry (@table) {
		my ($entity, $char) = @$entry;
		$text =~ s/\Q$entity\E/$char/g;
	}
	return $text;
}

# Copy the first input line verbatim into the .mw file
# (presumably the opening <mediawiki ...> tag of the dump -- TODO confirm).
open MW, ">", "$datadir/$lang$project.mw"
	or die "cannot write $datadir/$lang$project.mw: $!";
$mw = <>;
# Empty input leaves an empty file instead of printing undef.
print MW $mw if defined $mw;
close MW or die "cannot close $datadir/$lang$project.mw: $!";

# Evaluate the <siteinfo> block collected in the global $si:
#   - fill %namespace (namespace name => numeric key) for all named namespaces,
#   - remember the local names of namespace 6 in $image and 14 in $category,
#   - save the block unchanged as "$datadir/$lang$project.si".
# Takes no arguments; dies if the .si file cannot be written.
sub xmlsiteinfo {
	# Newer dumps carry further attributes after the key, e.g.
	# <namespace key="6" case="first-letter">, so anything up to the end of
	# the start tag is skipped; only the numeric key itself is captured.
	while($si =~ /<namespace key="(-?\d+)"[^>]*?(?:\/>|>(.*?)<\/namespace>)/sg) {
		my ($key, $name) = ($1, $2);
		# The main namespace is an empty element and has no name.
		next unless defined $name;
		$namespace{$name} = $key;
		$image = $name if $key==6;
		$category = $name if $key==14;
	}
	open SITEINFO, ">", "$datadir/$lang$project.si"
		or die "cannot write $datadir/$lang$project.si: $!";
	print SITEINFO $si;
	close SITEINFO or die "cannot close $datadir/$lang$project.si: $!";
}

# Collect input lines in $si up to and including the closing </siteinfo>
# line, then hand the block to xmlsiteinfo() and stop reading.
while(defined($_ = <>)) {
	$si .= $_;
	next unless /^\s*<\/siteinfo/;
	xmlsiteinfo();
	last;
}

use Inline C => <<'END';
#include "unicodeAttributes.h"

/*
 * DJBHashes: hash every run of `lws` consecutive words of one section.
 *
 *   sec              NUL-terminated UTF-8 bytes of the section text
 *   revNr            revision number stored for hashes seen for the first time
 *   secNr            section number appended to secNrsBySeq for every hash
 *   firstOccurrence  Perl hash: sequence hash => revNr of the first sighting
 *   secNrsBySeq      Perl hash: sequence hash => array ref of section numbers
 *   lws              number of words per hashed sequence (must be > 0, it
 *                    sizes the ring buffers)
 *
 * A word is a maximal run of letters/digits (Unicode general categories
 * Lu, Ll, Lt, Lm, Lo, Nd, Nl, No) or one single code point flagged as CJK.
 * unicodeGC[] keeps the general category in its low 7 bits and the CJK
 * flag in the top bit. Each time a word ends and at least lws words have
 * been seen, the bytes of the last lws words, joined by single spaces,
 * are hashed with the DJB function (hash = 33*hash + byte, start 5381).
 */
void DJBHashes(unsigned char* sec, int revNr, int secNr, HV* firstOccurrence, HV* secNrsBySeq, int lws) {
	int wStart[lws], wEnd[lws]; // ring buffers: start and end(excl) of words
	int pos=0, wNr=0;
	// c starts non-zero: a `continue` on the very first byte jumps straight
	// to the loop condition, which must not test an uninitialised value.
	int c=-1, inWord=0, inCJK=0;
	do {
		int pos0 = pos;
		int b1 = sec[pos++];
		if(b1<128) {
			c = b1;
		} else if(b1<192) {
			continue; // stray continuation byte
		} else if(b1<248) {
			// Lead byte of a 2-, 3- or 4-byte sequence.
			int extra = b1<224 ? 1 : (b1<240 ? 2 : 3);
			c = b1 & (0x3f >> extra); // payload bits: 0x1f, 0x0f or 0x07
			while(extra-- > 0) {
				int b = sec[pos];
				if(b==0) {
					// Sequence cut off by the terminating NUL: treat it as
					// the end of the text instead of reading past the buffer.
					c = 0;
					break;
				}
				pos++;
				c = (c<<6) | (b&63);
			}
		} else {
			continue; // lead bytes 0xF8..0xFF are ignored
		}
		if(c>=17*65536)
			continue; // outside the range covered by unicodeGC[]
		unsigned char generalCategory = unicodeGC[c] & 0x7f;
		unsigned char isCJK = unicodeGC[c] >= 0x80;
		int isAlNum =
			generalCategory == unicode_Lu ||
			generalCategory == unicode_Ll ||
			generalCategory == unicode_Lt ||
			generalCategory == unicode_Lm ||
			generalCategory == unicode_Lo ||
			generalCategory == unicode_Nd ||
			generalCategory == unicode_Nl ||
			generalCategory == unicode_No;
		int newWord = 0;
		if(inCJK) {
			// A CJK character is a word of its own: it ends right here.
			wEnd[wNr++%lws]=pos0;
			inCJK=0;
			newWord=1;
		} else if(inWord && (!isAlNum || isCJK)) {
			wEnd[wNr++%lws]=pos0;
			inWord=0;
			newWord=1;
		}
		if(newWord && wNr>=lws) {
			// Hash the last lws words straight from the ring buffers.
			unsigned int hash=5381;
			int j;
			for(j=wNr-lws; j<wNr; j++) {
				int k;
				for(k=wStart[j%lws]; k<wEnd[j%lws]; k++)
					hash = 33*hash+sec[k];
				if(j<wNr-1)
					hash = 33*hash+' ';
			}
			SV* hashSV = sv_2mortal(newSViv(hash));
			// Only the first revision showing a sequence is recorded.
			if(!hv_exists_ent(firstOccurrence, hashSV, 0))
				hv_store_ent(firstOccurrence, hashSV, newSViv(revNr), 0);
			AV* secNrs;
			HE* secNrsHE = hv_fetch_ent(secNrsBySeq, hashSV, 0, 0);
			if(!secNrsHE) {
				secNrs = newAV();
				hv_store_ent(secNrsBySeq, hashSV, newRV_noinc((SV*)secNrs), 0);
			} else {
				secNrs = (AV*)SvRV(HeVAL(secNrsHE));
			}
			av_push(secNrs, newSViv(secNr));
		}
		if(isCJK) {
			wStart[wNr%lws] = pos0;
			inCJK=1;
		} else if(isAlNum && !inWord) {
			wStart[wNr%lws] = pos0;
			inWord=1;
		}
	} while(c!=0); // the NUL itself is processed once so it closes an open word
}
END

# Maps the name of an HTML character entity (without "&" and ";") to its
# Unicode code point in decimal; looked up by entityToUnicodeChar().
# Names are case-sensitive (e.g. "Alpha" and "alpha" are different keys).
%entityToUnicode = (
	zwnj => 8204, zwj => 8205, lrm => 8206, rlm => 8207, 
	nbsp => 160, iexcl => 161, cent => 162, pound => 163, 
	curren => 164, yen => 165, brvbar => 166, sect => 167, 
	uml => 168, copy => 169, ordf => 170, laquo => 171, 
	not => 172, shy => 173, reg => 174, macr => 175, 
	deg => 176, plusmn => 177, sup2 => 178, sup3 => 179, 
	acute => 180, micro => 181, para => 182, middot => 183, 
	cedil => 184, sup1 => 185, ordm => 186, raquo => 187, 
	frac14 => 188, frac12 => 189, frac34 => 190, iquest => 191, 
	Agrave => 192, Aacute => 193, Acirc => 194, Atilde => 195, 
	Auml => 196, Aring => 197, AElig => 198, Ccedil => 199, 
	Egrave => 200, Eacute => 201, Ecirc => 202, Euml => 203, 
	Igrave => 204, Iacute => 205, Icirc => 206, Iuml => 207, 
	ETH => 208, Ntilde => 209, Ograve => 210, Oacute => 211, 
	Ocirc => 212, Otilde => 213, Ouml => 214, times => 215, 
	Oslash => 216, Ugrave => 217, Uacute => 218, Ucirc => 219, 
	Uuml => 220, Yacute => 221, THORN => 222, szlig => 223, 
	agrave => 224, aacute => 225, acirc => 226, atilde => 227, 
	auml => 228, aring => 229, aelig => 230, ccedil => 231, 
	egrave => 232, eacute => 233, ecirc => 234, euml => 235, 
	igrave => 236, iacute => 237, icirc => 238, iuml => 239, 
	eth => 240, ntilde => 241, ograve => 242, oacute => 243, 
	ocirc => 244, otilde => 245, ouml => 246, divide => 247, 
	oslash => 248, ugrave => 249, uacute => 250, ucirc => 251, 
	uuml => 252, yacute => 253, thorn => 254, yuml => 255, 
	fnof => 402, Alpha => 913, Beta => 914, Gamma => 915, 
	Delta => 916, Epsilon => 917, Zeta => 918, Eta => 919, 
	Theta => 920, Iota => 921, Kappa => 922, Lambda => 923, 
	Mu => 924, Nu => 925, Xi => 926, Omicron => 927, 
	Pi => 928, Rho => 929, Sigma => 931, Tau => 932, 
	Upsilon => 933, Phi => 934, Chi => 935, Psi => 936, 
	Omega => 937, alpha => 945, beta => 946, gamma => 947, 
	delta => 948, epsilon => 949, zeta => 950, eta => 951, 
	theta => 952, iota => 953, kappa => 954, lambda => 955, 
	mu => 956, nu => 957, xi => 958, omicron => 959, 
	pi => 960, rho => 961, sigmaf => 962, sigma => 963, 
	tau => 964, upsilon => 965, phi => 966, chi => 967, 
	psi => 968, omega => 969, thetasym => 977, upsih => 978, 
	piv => 982, bull => 8226, hellip => 8230, prime => 8242, 
	Prime => 8243, oline => 8254, frasl => 8260, weierp => 8472, 
	image => 8465, real => 8476, trade => 8482, alefsym => 8501, 
	larr => 8592, uarr => 8593, rarr => 8594, darr => 8595, 
	harr => 8596, crarr => 8629, lArr => 8656, uArr => 8657, 
	rArr => 8658, dArr => 8659, hArr => 8660, forall => 8704, 
	part => 8706, exist => 8707, empty => 8709, nabla => 8711, 
	isin => 8712, notin => 8713, ni => 8715, prod => 8719, 
	sum => 8721, minus => 8722, lowast => 8727, radic => 8730, 
	prop => 8733, infin => 8734, ang => 8736, and => 8743, 
	or => 8744, cap => 8745, cup => 8746, int => 8747, 
	there4 => 8756, sim => 8764, cong => 8773, asymp => 8776, 
	ne => 8800, equiv => 8801, le => 8804, ge => 8805, 
	sub => 8834, sup => 8835, nsub => 8836, sube => 8838, 
	supe => 8839, oplus => 8853, otimes => 8855, perp => 8869, 
	sdot => 8901, lceil => 8968, rceil => 8969, lfloor => 8970, 
	rfloor => 8971, lang => 9001, rang => 9002, loz => 9674, 
	spades => 9824, clubs => 9827, hearts => 9829, diams => 9830, 
	quot => 34, amp => 38, lt => 60, gt => 62, 
	OElig => 338, oelig => 339, Scaron => 352, scaron => 353, 
	Yuml => 376, circ => 710, tilde => 732, ensp => 8194, 
	# NOTE(review): zwnj, zwj, lrm and rlm below repeat the entries of the
	# first row with identical values; the repetition is harmless.
	emsp => 8195, thinsp => 8201, zwnj => 8204, zwj => 8205, 
	lrm => 8206, rlm => 8207, ndash => 8211, mdash => 8212, 
	lsquo => 8216, rsquo => 8217, sbquo => 8218, ldquo => 8220, 
	rdquo => 8221, bdquo => 8222, dagger => 8224, Dagger => 8225, 
	permil => 8240, lsaquo => 8249, rsaquo => 8250, euro => 8364, 
);

# Translate one character reference into the character it denotes.
#   "&name;"  - looked up in %entityToUnicode; unknown names give ""
#   "&#NNN;"  - decimal code point
#   "&#xHH;"  - hexadecimal code point
# Numeric references of 65536 and above give "". The result is a Perl
# character string, not UTF-8 encoded bytes. For input that is none of the
# three forms no value is returned explicitly.
sub entityToUnicodeChar {
	my ($ref) = @_;

	if($ref =~ /&(\w+);/) {
		my $codePoint = $entityToUnicode{$1};
		return defined $codePoint ? pack("U", $codePoint) : "";
	}

	if($ref =~ /&#(\d+);/) {
		return $1<65536 ? chr($1) : "";
	}

	# Deliberately left as the last statement without a trailing return,
	# so that unmatched input behaves exactly as before.
	if($ref =~ /&#x([0-9A-Fa-f]+);/) {
		my $codePoint = hex $1;
		return $codePoint<65536 ? pack("U", $codePoint) : "";
	}
}

# Strip wiki/HTML markup from one section and register the hashes of all
# its word sequences via DJBHashes().
#   $sec    raw section text as found in the dump (still XML-escaped)
#   $secNr  number of the section within the current block
#   $revNr  number of the revision in which the section was first seen
# Results are added to the globals %firstOccurrence and %secNrsBySeq.
# Sections longer than 200000 bytes are skipped.
sub findWordSequences {
	my ($sec, $secNr, $revNr) = @_;

	return if do { use bytes; length $sec } > 200000;

	# Local namespace names come from the dump; quote them so that regex
	# metacharacters in a name cannot alter the patterns below.
	my $nsAlt = join '|',
		(map { defined $_ ? quotemeta $_ : '' } $category, $image),
		'category', 'image';

	$sec = xmlunesc($sec);
#	$sec =~ s/^=+.*?=+//;
	# interlanguage links
	$sec =~ s/\[\[(?:.{2,3}|minnan|simple):.+?\]\]//isg;
	# ordinary links: keep only the visible text
	$sec =~ s/\[\[(?!(?:$nsAlt):)([^\]\|]*?\|)?([^\|]+?)\]\]/$2/isg;
	# templates and tables, innermost first until none is left
	while($sec =~ s/\{\{((?!\{\{).)*?\}\}//sg) {}
	while($sec =~ s/\{\|((?!\{\|).)*?\|\}//sg) {}
	while($sec =~ s/<table((?!<table).)*?<\/table>//isg) {}
	# category and image links
	$sec =~ s/\[\[(?:$nsAlt):.*?\]\]//isg;
	# external links, bracketed and bare
	$sec =~ s/\*? ?\[(?:http|ftp|mailto).*?\]//isg;
	$sec =~ s/\*? ?(?:http|ftp|mailto):\S*//isg;
	$sec =~ s/<math>.*?<\/math>//sg;
	$sec =~ s/<(?:div|font|span).*?>//isg;
	$sec =~ s/<.{1,10}?>/ /sg;
	# Character references: "&name;", "&#NNN;" or "&#xHH;". The pattern
	# must cover the whole reference including "&" and ";", otherwise
	# entityToUnicodeChar() cannot recognise it.
	$sec =~ s{(&(?:\w+|\#(?:\d+|x[0-9A-Fa-f]+));)}{
		my $ch = entityToUnicodeChar($1);
		$ch = "" unless defined $ch;
		# $sec holds UTF-8 bytes, so the decoded character is inserted as
		# UTF-8 bytes too; a character string would upgrade $sec and hand
		# double-encoded text to the C code.
		utf8::upgrade($ch);
		utf8::encode($ch);
		$ch;
	}esg;

	DJBHashes($sec, $revNr, $secNr, \%firstOccurrence, \%secNrsBySeq, $lws);
}

# For every word sequence of the current block, raise $lastOccurrence{$seq}
# to the highest @secLastOccurrence value among the sections that contain
# the sequence (as listed in %secNrsBySeq). Existing larger values are kept.
sub updateLastOccurrences {
	for my $seq (keys %secNrsBySeq) {
		for my $secNr (@{ $secNrsBySeq{$seq} }) {
			my $seenIn = $secLastOccurrence[$secNr];
			# keep the stored revision unless this one is later
			next if defined($lastOccurrence{$seq}) && $seenIn <= $lastOccurrence{$seq};
			$lastOccurrence{$seq} = $seenIn;
		}
	}
}

# Write one record per word sequence of the current page to SEQ and store
# the number of records in $nSeqs. A record consists of three 32-bit
# big-endian integers: the sequence hash and the ids of the revisions
# holding its first and its last occurrence.
sub flushSeqs {
	my @seqs = keys %firstOccurrence;
	$nSeqs = @seqs;
	foreach my $seq (@seqs) {
		my $firstRev = $firstOccurrence{$seq};
		my $lastRev  = $lastOccurrence{$seq};
		print SEQ pack("N3", $seq, $revId[$firstRev], $revId[$lastRev]);
	}
}

# Replace the <text> element of one revision by a compact reference.
#   $t1, $t3    opening and closing tag as matched by the caller
#   $t2         text content of the element
#   $dontSplit  true:  store the whole text in TXT and return a self-closing
#                      <text offset=... /> element pointing at it
#               false: split the text into sections, register new sections
#                      in the current block and return a "sectionlist"
#                      element holding the section numbers
#   $revNr      number of the revision (only used when splitting)
# Returns the replacement string for the whole <text>...</text> element.
sub text {
	my ($t1, $t2, $t3, $dontSplit, $revNr) = @_;
	my $lenTxt = do { use bytes; length $t2 };
	if(!$dontSplit) {
		# Split in front of every heading line ("==...=="). The newline
		# before the heading is consumed by split and re-added below.
		my @s = split(/\n(?===[^\n]*?==\r?\n)/, $t2);
		my @sNrs = (); my @sMd5 = ();
		for(my $i=0; $i<@s; $i++) {
			$s[$i] .= "\n" if $i<@s-1;
			my $smd5 = md5_base64 $s[$i];
			push @sMd5, $smd5;
			# title key: "<heading level>,<heading text>", "0," without heading
			my $secTitle = "0,";
			if($s[$i] =~ /^([=]{2,}) *([^\n]*?) *\1\r?\n/s) {
				$secTitle = length($1).",$2";
			}
			if(defined $secNrByMd5->{$smd5}) {
				# identical section already stored in this block
				push @sNrs, $secNrByMd5->{$smd5};
				$secLastOccurrence[$secNrByMd5->{$smd5}] = $revNr;
			} else {
				my $sNr = @$secs;
				push @sNrs, $sNr;
				$secNrByMd5->{$smd5} = $sNr;
				push @{$secNrsByTitle->{$secTitle}}, $sNr;
				push @$secs, $s[$i];
				push @$secsMd5, $smd5;
				push @$secsTitle, $secTitle;
				push @secLastOccurrence, $revNr;
				findWordSequences($s[$i], $sNr, $revNr);
			}
		}
		my $md5 = md5_base64(join(" ", @sMd5));
		$t1 = "<text type=\"sectionlist\" length=\"$lenTxt\" md5=\"$md5\">";
		$t2 = join(" ", @sNrs);
		return "$t1$t2$t3";
	} else {
		my $md5 = md5_base64($t2);
		# identical text already stored for this page: reuse its reference
		return $txtByMd5{$md5} if defined $txtByMd5{$md5};
		my $lenStored;
		if($lenTxt>100) {
			# raw deflate stream (negative WindowBits: no zlib header)
			my ($gz, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
			die "deflateInit failed: $status" unless defined $gz;
			my ($head) = $gz->deflate($t2);
			my ($tail) = $gz->flush();
			my $packed = $head . $tail;
			print TXT $packed if $saveText;
			$lenStored = do { use bytes; length $packed };
			$t1 = "<text offset=\"$posTxt\" lengthGz=\"$lenStored\" length=\"$lenTxt\" md5=\"$md5\" />";
		} else {
			# short texts are stored uncompressed
			print TXT $t2 if $saveText;
			$lenStored = $lenTxt;
			$t1 = "<text offset=\"$posTxt\" length=\"$lenTxt\" md5=\"$md5\" />";
		}
		# Advance by the number of bytes actually stored, i.e. by the
		# compressed length for deflated texts; otherwise every following
		# offset would point to the wrong place in the .txt file.
		$posTxt += $lenStored;
		$txtByMd5{$md5} = $t1;
		return $t1;
	}
}

# Store the sections collected for the current block and describe them.
# All sections are deflated as one raw stream into TXT, sections sharing a
# title being written next to each other. A <sectiongroup> element with
# one <section> entry per section (offset and length within the
# uncompressed stream, md5, title) is then fed into the revision
# compressor $gzRev; $posTxt and $lenRev are advanced accordingly.
sub flushSecs {
	if(@$secs) {
		($gzTxt, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
		my $gzBytes = 0;  # compressed size of the whole group
		my $rawPos  = 0;  # running offset within the uncompressed stream
		my (@offsetOf, @lengthOf);
		for my $title (keys %$secNrsByTitle) {
			for my $nr (@{ $secNrsByTitle->{$title} }) {
				$offsetOf[$nr] = $rawPos;
				$lengthOf[$nr] = do { use bytes; length $secs->[$nr] };
				$rawPos += $lengthOf[$nr];
				($buf, $status) = $gzTxt->deflate($secs->[$nr]);
				print TXT $buf if $saveText;
				$gzBytes += do { use bytes; length $buf };
			}
		}
		($buf, $status) = $gzTxt->flush();
		print TXT $buf if $saveText;
		$gzBytes += do { use bytes; length $buf };
		# the entries are listed by section number, not in storage order
		$xml = join "",
			"    <sectiongroup offset=\"$posTxt\" length=\"$gzBytes\">\n",
			(map {
				"      <section offset=\"$offsetOf[$_]\" length=\"$lengthOf[$_]\" ".
				"md5=\"$secsMd5->[$_]\" title=\"$secsTitle->[$_]\" />\n"
			} 0 .. $#$secs),
			"    </sectiongroup>\n";
		$posTxt += $gzBytes;
	} else {
		$xml = "    <sectiongroup />\n";
	}
	($buf, $status) = $gzRev->deflate($xml);
	print REV $buf;
	$lenRev += do { use bytes; length $buf };
}

# Write the revisions collected in @revs to REV.
#   $_[0]  true when this is the last block of the current page
# A page whose only block holds a single revision is written uncompressed
# ($cm = 0) with the revision text stored as a whole. Every other page is
# written as one raw deflate stream ($cm = 1): page header, then per block
# a <sectiongroup> followed by the revisions, whose texts are replaced by
# section lists. $lenRev accumulates the number of bytes written for the
# page. @revs and @revNrs are empty on return.
sub flushRevs {
	my $lastBlock = $_[0];
	# $revNr counts all revisions of the page so far; if all of them are
	# still in @revs, this is the first block of the page.
	if($revNr == @revs) {
		# The uncompressed form closes the page, so it is only valid for
		# a block that is both the first and the last one.
		if(@revs==1 && $lastBlock) {
			$revs[0] =~ s/(<text.*?>)(.*?)(<\/text>)/text($1,$2,$3,1,$revNrs[0])/es;
			$xml = $page . $revs[0] . "  </page>\n";
			print REV $xml;
			$lenRev = do { use bytes; length $xml };
			$cm = 0;
			# Clear the queues here as well: a stale @revNrs entry would
			# shift the revision numbers of the following pages.
			@revs = ();
			@revNrs = ();
			return;
		}
		($gzRev, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
		($buf, $status) = $gzRev->deflate($page);
		print REV $buf;
		$lenRev = do { use bytes; length $buf };
		$cm = 1;
	}
	# section bookkeeping is per block
	$secs = []; $secNrByMd5 = {}; $secNrsByTitle = {}; $secsMd5 = []; $secsTitle = [];
	%secNrsBySeq = (); @secLastOccurrence = ();
	for(my $i=0; $i<@revs; $i++) {
		$revs[$i] =~ s/(<text.*?>)(.*?)(<\/text>)/text($1,$2,$3,0,$revNrs[$i])/es;
	}
	# the section group precedes the revisions that refer to it
	flushSecs();
	updateLastOccurrences();
	for my $rev (@revs) {
		($buf, $status) = $gzRev->deflate($rev);
		print REV $buf;
		$lenRev += do { use bytes; length $buf };
	}
	if($lastBlock) {
		($buf, $status) = $gzRev->deflate("  </page>\n");
		print REV $buf;
		$lenRev += do { use bytes; length $buf };
		($buf, $status) = $gzRev->flush();
		print REV $buf;
		$lenRev += do { use bytes; length $buf };
	}
	@revs = ();
	@revNrs = ();
}

# Start from scratch: remove the databases of an earlier run, then open
# the output files. Every failure is fatal because all later writes would
# otherwise be lost silently.
unlink "$datadir/$lang$project.idx", "$datadir/$lang$project.nEdits";
unless($delayIdx) {
	tie(%idx, "DB_File", "$datadir/$lang$project.idx")
		or die "cannot tie $datadir/$lang$project.idx: $!";
}
# The three files below receive binary data (deflate streams, packed integers).
open REV, ">", "$datadir/$lang$project.rev"
	or die "cannot write $datadir/$lang$project.rev: $!";
binmode REV;
if($saveText) {
	open TXT, ">", "$datadir/$lang$project.txt"
		or die "cannot write $datadir/$lang$project.txt: $!";
	binmode TXT;
}
open SEQ, ">", "$datadir/$lang$project.seq"
	or die "cannot write $datadir/$lang$project.seq: $!";
binmode SEQ;
# current write positions within the .rev, .txt and .seq files
$posRev = 0; $posTxt = 0; $posSeq = 0;
# Main loop: read the rest of the dump line by line. The flags $inPage,
# $inRev, $inText and $inContributor track the position within the XML;
# $nsOk is cleared for pages whose title carries a known namespace prefix
# with a non-zero key, and such pages are skipped.
while(<>) {
	# A new page starts, either after a finished page or after one that
	# was rejected because of its namespace: reset all per-page state.
	if((!$inPage || !$nsOk) && /^\s*<page/) {
		$page = $_;
		@revs = ();
		$revNr = 0;
		$inPage = 1;
		$nsOk = 1;
		$lenRev = 0;
		%txtByMd5 = ();
		%firstOccurrence = ();
		%lastOccurrence = ();
		@revId = ();
		next;
	}
	# skip everything up to the next <page> once the page was rejected
	next unless $nsOk;
	if($inRev) {
		# inside <revision>: every line is appended to $rev
		if($inText) {
			$rev .= $_;
			# the text ends on the line that ends with </text>
			if(/<\/text>$/) {
				$inText = 0;
			}
		} else {
			if(/^(\s*<text[^>]*?)( \/)?>/) {
				# an empty element <text ... /> is rewritten as
				# <text ...></text> so that flushRevs() can match it
				$_ = "$1></text>\n" if defined $2;
				$rev .= $_;
				# stay in text mode unless the element was empty or its
				# closing tag is on this very line
				$inText = 1 unless defined $2 || $' =~/<\/text>$/;
			} elsif(/^\s*<\/revision/) {
				# revision complete: queue it together with its number
				# and flush as soon as a block is full
				$rev .= $_;
				push @revs, $rev;
				push @revNrs, $revNr++;
				flushRevs(0) if @revs==$revsPerBlock;
				$inRev = 0;
			} elsif(/^\s*<username>(.*?)<\/username>/) {
				$rev .= $_;
				# edit counter per user name
				$nEdits{$1}++ if $makeNEdits;
			} elsif(/^\s*<contributor>/) {
				$rev .= $_;
				$inContributor = 1;
			} elsif(/^\s*<\/contributor>/) {
				$rev .= $_;
				$inContributor = 0;
			} elsif(/^\s*<id>(.*?)<\/id>/ && !$inContributor) {
				# an <id> outside <contributor> is the id of the revision
				$rev .= $_;
#print STDERR "$title, revNr=$revNr, id=$1\n";
				$revId[$revNr] = $1;
			} else {
				$rev .= $_;
			}
		}
		next;
	}
	if(/^\s*<revision/) {
		$rev = $_;
		$inRev = 1;
		next;
	}
	if(/^\s*<\/page/) {
		# page complete: write the remaining revisions and the word
		# sequences, then record where the page lives in both files
		flushRevs(1);
		flushSeqs();
		# index entry: offset and length in .rev, compression flag,
		# offset in .seq and number of sequence records
		$idx{$title} = "$posRev $lenRev $cm $posSeq $nSeqs";
		$posRev += $lenRev;
		# flushSeqs() writes 12 bytes (three 32-bit integers) per sequence
		$posSeq += 12*$nSeqs;
		$inPage = 0;
#last if $title eq "Aussagenlogik";
		next;
	}
	# any other line belongs to the page header and is kept in $page
	$page .= $_;
	if(/^\s*<title(?:.*?)>(.*?)<\/title>/) {
		$title = xmlunesc($1);
		# a prefix up to the first colon counts as a namespace only if it
		# is listed in %namespace
		if($title =~ /(.+?):(.+)/ && defined $namespace{$1}) {
			$namespace = $namespace{$1};
		} else {
			$namespace = 0;
		}
		# index keys use underscores instead of whitespace
		$title =~ s/\s/_/g;
		$nsOk = 0 if $namespace!=0;
	}
}
# Close the output files. Buffered write errors (e.g. a full disk) only
# show up at close time, so the results are checked.
close REV or die "error closing $datadir/$lang$project.rev: $!";
if($saveText) {
	close TXT or die "error closing $datadir/$lang$project.txt: $!";
}
close SEQ or die "error closing $datadir/$lang$project.seq: $!";

# Page index: either copy the in-memory hash into the database file now,
# or release the hash that was tied to the file all along.
if($delayIdx) {
	tie(%idx2, "DB_File", "$datadir/$lang$project.idx")
		or die "cannot tie $datadir/$lang$project.idx: $!";
	%idx2 = %idx;
	untie %idx2;
} else {
	untie %idx;
}

# Edit counts per user name, collected in memory during the run.
if($makeNEdits) {
	tie(%nEdits2, "DB_File", "$datadir/$lang$project.nEdits")
		or die "cannot tie $datadir/$lang$project.nEdits: $!";
	%nEdits2 = %nEdits;
	untie %nEdits2;
}