Module:zh-usex: difference between revisions

From Wiktionary, the free dictionary
Jump to navigation Jump to search
Content deleted Content added
+nan pron_correction 咧
Wyang (talk | contribs)
reduce size
Line 141: Line 141:
["wuu"] = {},
["wuu"] = {},
}
}

local zh_format_start = "<span style=\"font-size:90%\"><span lang=\"zh\" class=\"Hani\">"
local zh_format_end = "</span></span>"


function export.show(frame)
function export.show(frame)
Line 151: Line 154:
local audio_file = args["a"] or args["audio"] or false
local audio_file = args["a"] or args["audio"] or false
local phonetic = ""
local phonetic = ""
local original_length = mw.ustring.len(gsub(example, "[^一-龯㐀-䶵]", ""))
local variety = args[3] or (ref_list[reference] and ref_list[reference][1] or false) or "MSC"
local variety = args[3] or (ref_list[reference] and ref_list[reference][1] or false) or "MSC"
variety_code = variety_list[variety][2]
variety_code = variety_list[variety][2]
Line 301: Line 304:
end
end
local tag_start = " <span style=\"color:darkgreen; font-size:x-small;\"><span>[</span>" -- "[[[w:MSC|MSC]]" is interpreted poorly, hence the dummy <span>s
local tag_start = " <small><span style=\"color:darkgreen; font-size:x-small;\"><span>[</span>" -- "[[[w:MSC|MSC]]" is interpreted poorly, hence the dummy <span>s
local tag_end = "<span>]</span></span>"
local tag_end = "<span>]</span></span></small>"
if display == "ruby" then
if display == "ruby" then
Line 364: Line 367:
-- indentation, font and identity tags
-- indentation, font and identity tags
if original_length > 10 then
if match(example, "[,。?!、:; ]") then
trad_text = "<dd><span lang=\"zh\" class=\"Hani\">" .. trad_text .. "</span>"
trad_text = "<dd>" .. zh_format_start .. trad_text .. zh_format_end
if phonetic then
if phonetic then
phonetic = "<dl><dd>" .. phonetic
phonetic = "<dl><dd>" .. phonetic
Line 382: Line 385:
if simp_exist then
if simp_exist then
simp_text = "<dd><span lang=\"zh\" class=\"Hani\">" .. simp_text .. "</span>"
simp_text = "<dd>" .. zh_format_start .. simp_text .. zh_format_end
simp_tag = tag_start .. variety_list[variety][1] .. ", <i>[[w:Simplified Chinese|simp.]]</i>" .. tag_end .. "</dd>"
simp_tag = tag_start .. variety_list[variety][1] .. ", <i>[[w:Simplified Chinese|simp.]]</i>" .. tag_end .. "</dd>"
end
end
Line 394: Line 397:
else
else
trad_text = "<span lang=\"zh\" class=\"Hani\">" .. trad_text .. "</span>"
trad_text = zh_format_start .. trad_text .. zh_format_end
divider = "&nbsp; ―&nbsp; "
divider = "&nbsp; ―&nbsp; "
Line 407: Line 410:
if simp_exist then
if simp_exist then
simp_text = " / <span lang=\"zh\" class=\"Hani\">" .. simp_text .. "</span>"
simp_text = " / " .. zh_format_start .. simp_text .. zh_format_end
end
end

Revision as of 12:39, 9 May 2016


This module does the work for {{zh-x}}; see that template's documentation for more.

Data for this module is found in Module:zh-usex/data.

More (informal) testcases can be found in Template:zh-x/testcases. Put stuff that isn't working there!


local m_zh = require("Module:zh")
local m_zh_data = require("Module:zh/data")
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local sub = mw.ustring.sub
local PAGENAME = PAGENAME or mw.title.getCurrentTitle().text

local export = {}

-- Lect (variety) codes accepted as the third template argument.
-- Each entry is a three-element array:
--   [1] wikilinked display name shown in the green identity tag,
--   [2] language code used by export.show to pick the transcription method,
--   [3] name of the romanisation shown next to the transcription.
-- Indented entries are sub-varieties of the entry above them (layout only).
local variety_list = {
	["MSC"] = { "[[w:Standard Chinese|MSC]]", "cmn", "Pinyin" },
		["M-TJ"] = { "[[w:Tianjin dialect|Tianjin Mandarin]]", "cmn", "Pinyin" },
	
	["CL"] = { "[[w:Classical Chinese|Classical Chinese]]", "cmn", "Pinyin" },
	
	["C"] = { "[[w:Cantonese|Cantonese]]", "yue", "Jyutping" },
		["C-GZ"] = { "[[w:Cantonese|Guangzhou Cantonese]]", "yue", "Jyutping" },
		["C-LIT"] = { "[[w:Cantonese|Literary Cantonese]]", "yue", "Jyutping" },
	
	["MD"] = { "[[w:Min Dong|Min Dong]]", "cdo", "[[w:Bàng-uâ-cê|Bàng-uâ-cê]] / IPA" },
	
	["MN"] = { "[[w:Min Nan|Min Nan]]", "nan", "[[w:Pe̍h-ōe-jī|Pe̍h-ōe-jī]]" },
		["TW"] = { "[[w:Taiwanese|Taiwanese]]", "nan", "[[w:Pe̍h-ōe-jī|Pe̍h-ōe-jī]]" },
		["MN-T"] = { "[[w:Teochew dialect|Teochew]]", "nan-teo", "[[w:zh:潮州話拼音方案|Peng\'im]]" },
		
	["W"] = { "[[w:Wu Chinese|Wu]]", "wuu", "IPA" },
		["SH"] = { "[[w:Shanghainese|Shanghainese]]", "wuu", "IPA" },
		
	["H"] = { "[[w:Hakka Chinese|Hakka]]", "hak", "[[w:Pha̍k-fa-sṳ|Pha̍k-fa-sṳ]]" },
}

-- Punctuation found in an example mapped to the character emitted in its place
-- in the automatic transcription. export.show also uses membership in this
-- table to decide that a token (or the page title) is punctuation and must be
-- neither linked nor auto-bolded.
local punctuation = {
	[","] = ",",   ["。"] = ".",   ["、"] = ",",
	["?"] = "?",   ["!"] = "!",
	
	-- every kind of opening/closing quote or title mark becomes a curly single
	-- quote here; export.show later turns those into straight double quotes
	["“"] = "‘",    ["”"] = "’",
	["‘"] = "‘",    ["’"] = "’",
	["《"] = "‘",   ["》"] = "’",
	["『"] = "‘",   ["』"] = "’",
	["「"] = "‘",   ["」"] = "’",
	
	["("] = "(",   [")"] = ")",
	[";"] = ";",   [":"] = ":",
	["|"] = "|",    ["—"] = "-",
	["·"] = " ",    ["…"] = "...",

	[" "] = ";",
}

-- Known source works, keyed by the value of the ref= / r= template argument.
-- Each entry is a two-element array:
--   [1] variety code (a key of variety_list) used as the default variety when
--       the third template argument is not given,
--   [2] wikitext citation printed after "From:" in the long layout.
-- A ref value that is not listed here is printed verbatim by export.show.
local ref_list = {
	['Analects']   =  { "CL",  "The ''[[w:Analects|Analects]] of Confucius'', circa 475 – 221 BCE" },
	['Hanfeizi']   =  { "CL",  "''[[w:Han Feizi (book)|Han Feizi]]'', circa 2nd century BCE" },
	['Hanshu']     =  { "CL",  "The ''[[w:Book of Han|Book of Han]]'', circa 1st century CE" },
	['Liji']       =  { "CL",  "The ''[[w:Book of Rites|Book of Rites]]'', circa 4th – 2nd century BCE" },
	['Mengzi']     =  { "CL",  "''[[w:Mencius (book)|Mengzi]] (Mencius)'', circa 4th century BCE" }, 
	['Mozi']       =  { "CL",  "''[[w:Mozi|Mozi]] ([[w:zh:墨子 (书)|book]])'', circa 4th century BCE" }, 
	['Shangshu']   =  { "CL",  "The ''[[w:Book of Documents|Book of Documents]]'', circa 4th – 3rd century BCE" },
	['Shiji']      =  { "CL",  "The ''[[w:Records of the Grand Historian|Records of the Grand Historian]]'', by [[w:Sima Qian|Sima Qian]], circa 91 BCE" },
	['Shijing']    =  { "CL",  "The ''[[w:Classic of Poetry|Classic of Poetry]]'', circa 11th – 7th centuries BCE" },
	-- NOTE(review): 'Shangshu' and 'Shujing' both link to the Book of Documents
	-- but give different dates; confirm which dating is intended.
	['Shujing']    =  { "CL",  "The ''[[w:Book of Documents|Book of Documents]]'', circa 7th – 4th centuries BCE" },
	['Shuowen']    =  { "CL",  "''[[w:Shuowen Jiezi|Shuowen Jiezi]]'', circa 2nd century CE" },
	['Houhanshu']  =  { "CL",  "The ''[[w:Book of the Later Han|Book of the Later Han]]'', circa 5th century CE" },
	['Yijing']     =  { "CL",  "''[[w:I Ching|I Ching]]'', 3rd – 2nd millennia BCE" },
	['Zhuangzi']   =  { "CL",  "''[[w:Zhuangzi (book)|Zhuangzi]]'', circa 3rd – 2nd centuries BCE" },
}

-- Per-language overrides for single characters, keyed by language code (the
-- second element of a variety_list entry). export.show substitutes every
-- character of a word that appears here with the given reading before the
-- automatic transcription runs, so the listed reading wins over whatever the
-- transcription step would otherwise pick.
local pron_correction = {
	["cmn"] = {
		["吧"] = "ba",
		["的"] = "de", ["都"] = "dōu", 
		["個"] = "ge", ["給"] = "gěi", ["更"] = "gèng",
		["還"] = "hái", 
		["幾"] = "jǐ", ["將"] = "jiāng",
		["了"] = "le", 
		["沒"] = "méi", 
		["漂"] = "piào", 
		["什"] = "shén", 
		["為"] = "wèi",
		["要"] = "yào", 
	},
	["yue"] = {
		["若"] = "joek6",
		["來"] = "loi4",
		["華"] = "waa4",
		["蛇"] = "se4",
	},
	-- Min Nan readings are written with a hyphen on both sides; export.show
	-- afterwards collapses runs of hyphens and trims them at word edges.
	["nan"] = {
		["人"] = "-lâng-",
		["共"] = "-kā-",
		["的"] = "-ê-",
		["講"] = "-kóng-",
		["予"] = "-hō͘-",
		["爸"] = "-pē-",
		["汝"] = "-lí-",
		["𨑨"] = "-chhit-",
		["迌"] = "-thô-",
		["𪜶"] = "-in-",
		["一"] = "-chi̍t-",
		["毋"] = "-m̄-",
		["欲"] = "-beh-",
		["到"] = "-kàu-",
		["閣"] = "-koh-",
		["佇"] = "-tī-",
		["佮"] = "-kah-",
		["仔"] = "-á-",
		["塊"] = "-tè-",
		["也"] = "-iā-",
		["攏"] = "-lóng-",
		["較"] = "-khah-",
		["阿"] = "-a-",
		["嬤"] = "-má-",
		["字"] = "-jī-",
		["多"] = "-to-",
		["士"] = "-sū-",
		["商"] = "-siong-",
		["斷"] = "-tn̄g-",
		["着"] = "-tio̍h-",
		["戴"] = "-tì-",
		["徛"] = "-khiā-",
		["跤"] = "-kha-",
		["誌"] = "-chì-",
		["啥"] = "-siáⁿ-",
		["行"] = "-kiâⁿ-",
		["甲"] = "-kah-",
		["雙"] = "-siang-",
		["日"] = "-ji̍t-",
		["咧"] = "-teh-"
	},
	["wuu"] = {},
}

-- Per-language overrides for multi-character strings, keyed by language code
-- in the same way as pron_correction. export.show replaces each key found in
-- a word with the given reading. Min Nan readings use the same
-- hyphen-delimited form as the single-character table.
local polysyllable_pron_correction = {
	["cmn"] = {},
	["yue"] = {},
	["nan"] = {
		["親像"] = "-chhin-chhiūⁿ-",
		["的確"] = "-tek-khak-",
		["歹勢"] = "-pháiⁿ-sè-",
		["請假"] = "-chhéng-ká-"
	},
	["wuu"] = {},
}

-- Opening and closing markup wrapped around every piece of displayed Chinese
-- text in the non-ruby layouts: 90% font size, lang="zh" and the Hani class.
local zh_format_start = "<span style=\"font-size:90%\"><span lang=\"zh\" class=\"Hani\">"
local zh_format_end = "</span></span>"

-- Wraps one space-delimited example word in wikilinks, one link per
-- hyphen-separated part. When the word contains a <b>...</b> span, each part
-- is linked to its text with the bold tags stripped, and a <b> or </b> that
-- the hyphen split left unbalanced is re-balanced inside the link label.
local function link_word(word)
	if not match(word, "<b>.+</b>") then
		-- parentheses drop gsub's second return value (the match count)
		return (gsub("[[" .. word .. "]]", "%-", "]][["))
	end
	
	local parts = mw.text.split(word, "-", true)
	for i, part in ipairs(parts) do
		local target = gsub(part, "</?b>", "")
		local opens, closes = match(part, "<b>"), match(part, "</b>")
		if opens and not closes then
			parts[i] = "[[" .. target .. "|" .. part .. "</b>]]"
		elseif closes and not opens then
			parts[i] = "[[" .. target .. "|<b>" .. part .. "]]"
		else
			parts[i] = "[[" .. target .. "|" .. part .. "]]"
		end
	end
	return table.concat(parts)
end

-- Entry point for {{zh-x}}: formats a Chinese usage example.
--
-- Reads the arguments of the parent frame (frame:getParent().args):
--   1                   the example text (required)
--   2                   the translation (required)
--   3                   variety code, a key of variety_list; defaults to the
--                       variety of a known ref=, otherwise "MSC"
--   ref / r             source: a key of ref_list or free text
--   tr                  manual transcription (disables the automatic one)
--   type / display_type "ruby" for the ruby layout, anything else is plain
--   a / audio           audio file name
--   link / l            any value containing "n" turns word linking off
--
-- Returns a wikitext/HTML string. Raises an error when argument 1 or 2 is
-- missing or when the variety code is not in variety_list.
function export.show(frame)
	local args = frame:getParent().args
	local example = args[1] or error("Example unspecified.")
	local translation = args[2] or error("Lacking translation.")
	local reference = args["ref"] or args["r"] or false
	local manual_tr = args["tr"] or false
	local display = args["type"] or args["display_type"] or "plain"
	local audio_file = args["a"] or args["audio"] or false
	-- number of Han characters in the raw example; selects the layout below
	local original_length = mw.ustring.len((gsub(example, "[^一-龯㐀-䶵]", "")))
	local variety = args[3] or (ref_list[reference] and ref_list[reference][1] or false) or "MSC"
	local variety_data = variety_list[variety]
	if not variety_data then
		-- fail with a readable message instead of "attempt to index a nil value"
		error("Unknown variety code \"" .. tostring(variety) .. "\".")
	end
	local variety_code = variety_data[2]
	
	local link = args["link"] or args["l"] or "yes"
	link = match(link, "n") == nil and not (not match(example, " ") and match(example, "[,。?!﹑]"))
	
	-- automatically boldify pagetitle if nothing is in bold
	if not match(example, "'''") and not punctuation[PAGENAME] then
		-- The title is literal text, not a pattern: escape pattern magic
		-- characters, and use a function replacement so that a "%" in the
		-- title is not read as a capture reference.
		local pagename_pattern = gsub(PAGENAME, "([%^%$%(%)%%%.%[%]%*%+%-%?])", "%%%1")
		example = gsub(example, pagename_pattern, function(found)
			return "'''" .. found .. "'''"
		end)
		example = gsub(example, "''''''", "")
	end
	
	-- tidying up the example, making it ready for transcription
	example = gsub(example, "([?!,。、“”…;:‘’|()「」—《》· ])", " %1 ")
	example = gsub(example, "^ *", "")
	example = gsub(example, " *$", "")
	example = gsub(example, " +", " ")
	example = gsub(example, "%'%'%'([^%']+)%'%'%'", "<b>%1</b>")
	example = gsub(example, "</b>(%[[^%[%]]+%])", "%1</b>")
	example = gsub(example, "</b>({[^{}]+})", "%1</b>")
	
	local ruby_start, ruby_mid, ruby_end = "<big><ruby><span class=\"Hani\">", "</span><rp>&nbsp;(</rp><rt><big>", "</big></rt><rp>)</rp></ruby></big>"
	local ruby_words = {}
	local trad_words, simp_words, tr_words = {}, {}, {}
	
	-- truthy when a separate simplified line has to be shown
	local simp_exist = m_zh.ts_determ(example) == "trad" or (match(example, "%[[^%[%]]+%]") and not match(example, "(.)%[%1%]"))
	-- Becomes true at the first non-punctuation word and is deliberately not
	-- reset per word, matching the behaviour of the previous implementation.
	local real_word
	
	for word in mw.text.gsplit(example, " ", true) do
		local trad_word, simp_word, tr_word, ruby_word = word, false, false, ""
	
		-- various tricks for linking and display in trad. and simp.
		trad_word = gsub(trad_word, "(.)%[(.)%]", "%1")
		trad_word = gsub(trad_word, "{[^{}]+}", "")
		trad_word = gsub(trad_word, "[%^%.]", "")
		trad_word = gsub(trad_word, "\\", "|")
		
		if simp_exist then
			simp_word = gsub(m_zh.ts(word), ".%[(.)%]", "%1")
			simp_word = gsub(simp_word, "%{[^%}]+%}", "")
			simp_word = gsub(simp_word, "[%^%.]", "")
			simp_word = gsub(simp_word, "\\", "|")
		end
		
		-- produce links
		local contain_pagename = (gsub(gsub(gsub(trad_word, "</?b>", ""), "%^", ""), "-", "") == PAGENAME) and not punctuation[PAGENAME]
		if match(trad_word, "|") or (link and not match(trad_word, "@") and not punctuation[word] and not contain_pagename) then
			trad_word = link_word(trad_word)
			if simp_exist then
				simp_word = link_word(simp_word)
			end
		end
		
		trad_word = gsub(trad_word, "@", "")
		simp_word = simp_exist and gsub(simp_word, "@", "")
		
		-- same tricks applied to transcription
		if not manual_tr and (variety_code == "cmn" or variety_code == "yue" or variety_code == "nan") then
			if punctuation[word] then
				tr_word = punctuation[word]
			else
				real_word = true
				tr_word = gsub(word, "@", "")
				tr_word = gsub(tr_word, "%.", " ")
				tr_word = gsub(tr_word, ".+\\", "")
				tr_word = gsub(tr_word, ".%[([^%[%]]+)%]", "%1")
				tr_word = gsub(tr_word, ".</b>(%{[^%}]+%})", "%1</b>")
				tr_word = gsub(tr_word, ".%{([^%}]+)%}", "%1")
				-- Polysyllabic corrections must run before the per-character
				-- ones: otherwise a character such as "的" is already replaced
				-- and a key such as "的確" can never match.
				for key, val in pairs(polysyllable_pron_correction[variety_code]) do
					tr_word = gsub(tr_word, key, val)
				end
				tr_word = gsub(tr_word, ".", pron_correction[variety_code])
				if variety_code == "cmn" then
					tr_word = gsub(tr_word, "%-", "")
					tr_word = m_zh.py(tr_word)
				elseif variety_code == "yue" then
					tr_word = gsub(tr_word, ".", m_zh_data.jyutping)
					tr_word = gsub(tr_word, "([a-z])([1-9])(-?)([1-9]?)", "%1%2%3%4 ")
				elseif variety_code == "nan" then
					-- try the whole run of characters first, then fall back to
					-- character-by-character lookup joined with hyphens
					tr_word = gsub(tr_word, "[一-鿌㐀-䶵 -〿𠀀-𬺯]+", function(text) 
						if m_zh.check_pron(text, 'nan', 1) then
							return gsub(m_zh.check_pron(text, 'nan', 1), "/.+$", "")
						else
							text = gsub(text, ".", function(ch)
								if m_zh.check_pron(ch, 'nan', 1) then
									return gsub(m_zh.check_pron(ch, 'nan', 1), "/.+$", "") .. "-"
								else
									return ch
								end
							end)
							return gsub(text, "-$", "")
						end
					end)
					-- hyphen clean-up: collapse runs, drop hyphens that do not
					-- sit between romanised letters, trim the word edges
					tr_word = gsub(tr_word, "%-+", "-")
					tr_word = gsub(tr_word, "%-+([^ⁿa-záíúéóḿńàìùèòǹâîûêôāīūēōA-ZÁÍÚÉÓḾŃÀÌÙÈÒǸÂÎÛÊÔĀĪŪĒŌ])", "%1")
					tr_word = gsub(tr_word, "([^ⁿa-záíúéóḿńàìùèòǹâîûêôāīūēōoóòôōA-ZÁÍÚÉÓḾŃÀÌÙÈÒǸÂÎÛÊÔĀĪŪĒŌOÓÒÔŌ̄͘])%-+", "%1")
					tr_word = gsub(tr_word, "<b>", "-<b>")
					tr_word = gsub(tr_word, "</b>", "</b>-")
					tr_word = gsub(tr_word, "^%-+", "")
					tr_word = gsub(tr_word, "%-+$", "")
					tr_word = gsub(tr_word, "%%", "--")
				end
			end
		end
		
		if variety_code == "nan" then
			-- "%" is only a transcription marker (turned into "--" above);
			-- remove it from the displayed text
			trad_word = gsub(trad_word, "%%", "")
			simp_word = simp_exist and gsub(simp_word, "%%", "")
		end
		
		if display == "ruby" then
			ruby_word = ruby_start .. trad_word .. (simp_exist and "<br>" .. simp_word or "") .. ruby_mid .. (real_word and tr_word or "") .. ruby_end
			table.insert(ruby_words, ruby_word)
		else
			table.insert(trad_words, trad_word)
			table.insert(simp_words, simp_word or nil)
			table.insert(tr_words, tr_word or nil)
		end
	end
	
	local tag_start = " <small><span style=\"color:darkgreen; font-size:x-small;\"><span>[</span>" -- "[[[w:MSC|MSC]]" is interpreted poorly, hence the dummy <span>s
	local tag_end = "<span>]</span></span></small>"
	
	if display == "ruby" then
		-- closing tags are emitted in proper nesting order (</big> before </rb>)
		local tag = " <ruby><rb><big>" ..
				tag_start .. variety_data[1] .. 
					(simp_exist
						and ", ''[[Traditional Chinese|trad.]]''↑ + ''[[Simplified Chinese|simp.]]''↓"
						or ", ''[[Traditional Chinese|trad.]]'' and ''[[Simplified Chinese|simp.]]''") .. tag_end .. 
				
				tag_start .. "''rom.'': " .. variety_data[3] .. tag_end ..
					"</big></rb></ruby>"
			
		return table.concat(ruby_words, "") .. tag .. "<dl><dd><i>" .. translation .. "</i></dd></dl>"
	end
	
	local trad_text = table.concat(trad_words)
	local simp_text = simp_exist and table.concat(simp_words) or false
	local phonetic = manual_tr or (#tr_words > 0 and table.concat(tr_words, " ") or false)

	-- overall transcription formatting
	if phonetic then
		phonetic = gsub(phonetic, " </b>", "</b> ")
		phonetic = gsub(phonetic, "  ", " ")
		if variety_code == "yue" then
			phonetic = gsub(phonetic, "([a-z]+)([1-9%-]+)", "%1<sup>%2</sup>") -- superscript jyutping tones
		end
		phonetic = gsub(phonetic, " ([,%.?!;:’)])", "%1") -- remove excess spaces from punctuation
		phonetic = gsub(phonetic, "([‘(]) ", "%1")
		phonetic = gsub(phonetic, "[‘’]", "\"")
		if not manual_tr then
			phonetic = gsub(phonetic, "%'([^%'])", "%1") -- allow bolding for manual translit
			if variety_code == "nan" then
				phonetic = gsub(phonetic, " +%-%-", "--")
			end
		end
		
		-- capitalisation: "^" marks the next character to be upper-cased
		if match(example, "[。?!]") then
			phonetic = "^" .. gsub(phonetic, "([%.?!]) ", "%1 ^")
		end
		phonetic = gsub(phonetic, "%^<b>", "<b>^")
		phonetic = gsub(phonetic, "%^+.", mw.ustring.upper)
		phonetic = gsub(phonetic, "%^", "")
			
		if variety_code == "wuu" then
			local wuu_pron = require("Module:wuu-pron") 
			phonetic = "<span class=\"IPA\">[" .. wuu_pron.ipa_conv(phonetic) .. "]</span>"
		
		elseif variety_code == "cdo" then
			local cdo_pron = require("Module:cdo-pron")
			phonetic = "<i>" .. phonetic .. "</i> / " .. 
				(not match(phonetic, "-[^ ]+-")
					and "<span class=\"IPA\"><small>[" .. cdo_pron.sentence(phonetic) .. "]</small></span>"
					or "")
			
		else
			phonetic = "<i>" .. phonetic .. "</i>"
		end
		phonetic = "<span style=\"color:#404D52\">" .. phonetic .. "</span>"
	end
	
	-- indentation, font and identity tags
	if original_length > 10 then
		-- long example: one definition-list row per component
		local tr_tag, audio, simp_tag
		
		trad_text = "<dd>" .. zh_format_start .. trad_text .. zh_format_end
		if phonetic then
			phonetic = "<dl><dd>" .. phonetic
			translation = "<dd>" .. translation .. "</dd></dl>"
			tr_tag = tag_start .. variety_data[3] .. tag_end .. "</dd>"
		else
			translation = "<dl><i>" .. translation .. "</i></dl>"
		end
		
		if audio_file then
			audio = "<dd>[[File:" .. audio_file .. "]]</dd>"
		end
		
		local trad_tag = tag_start .. variety_data[1] .. ", <i>[[w:Traditional Chinese|trad.]]" .. 
			(simp_exist and "" or " and [[w:Simplified Chinese|simp.]]") .. "</i>" .. tag_end .. "</dd>"
		
		if simp_exist then
			simp_text = "<dd>" .. zh_format_start .. simp_text .. zh_format_end
			simp_tag = tag_start .. variety_data[1] .. ", <i>[[w:Simplified Chinese|simp.]]</i>" .. tag_end .. "</dd>"
		end
		
		if reference then
			-- <small> encloses <i> so that the tags are properly nested
			reference = "<dd><small><i>From:</i> " .. (ref_list[reference] and ref_list[reference][2] or reference) .. "</small></dd>"
		end
		
		return trad_text .. trad_tag .. (simp_text or "") .. (simp_tag or "") .. (reference or "") .. 
			(phonetic and phonetic .. tr_tag or "") .. (audio or "") .. translation
	end
	
	-- short example: everything on one line, separated by dashes
	local ts_tag, tr_tag, audio
	local divider = "&nbsp; ―&nbsp; "
	
	trad_text = zh_format_start .. trad_text .. zh_format_end
	
	if variety_code ~= "cmn" then
		ts_tag = tag_start .. variety_data[1] .. tag_end
		tr_tag = tag_start .. variety_data[3] .. tag_end
	end
	
	if not phonetic then
		translation = "<i>" .. translation .. "</i>"
	end
	
	if simp_exist then
		simp_text = " / " .. zh_format_start .. simp_text .. zh_format_end
	end
	
	if audio_file then
		audio = " [[File:" .. audio_file .. "]]"
	end
	
	return trad_text .. (simp_text or "") .. (ts_tag or "") .. divider .. 
		(phonetic and phonetic .. (tr_tag or "") .. (audio or "") .. divider or "") .. translation
end

return export