zdv2.bktei.com Git - BK-2020-03.git/blame_incremental

Commit	Line	Data
	1	#!/usr/bin/env python\r
	2	# May be run with either Python 2 or Python 3\r
	3	\r
	4	"""lexconvert v0.32 - convert phonemes between different speech synthesizers etc\r
	5	(c) 2007-20 Silas S. Brown. License: GPL"""\r
	6	\r
	7	# Run without arguments for usage information\r
	8	\r
	9	# This program is free software; you can redistribute it and/or modify\r
	10	# it under the terms of the GNU General Public License as published by\r
	11	# the Free Software Foundation; either version 3 of the License, or\r
	12	# (at your option) any later version.\r
	13	#\r
	14	# This program is distributed in the hope that it will be useful,\r
	15	# but WITHOUT ANY WARRANTY; without even the implied warranty of\r
	16	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
	17	# GNU General Public License for more details.\r
	18	\r
	19	# Old versions of this code are being kept in the E-GuideDog SVN repository at\r
	20	# http://svn.code.sf.net/p/e-guidedog/code/ssb22/lexconvert\r
	21	# and on GitHub at https://github.com/ssb22/lexconvert\r
	22	# and on GitLab at https://gitlab.com/ssb22/lexconvert\r
	23	# and on Bitbucket https://bitbucket.org/ssb22/lexconvert\r
	24	# and at https://gitlab.developers.cam.ac.uk/ssb22/lexconvert\r
	25	# although some early ones are missing.\r
	26	\r
	27	def Phonemes():\r
	28	"""Create phonemes by calling vowel(), consonant(),\r
	29	variant() and other().\r
	30	\r
	31	For the variants, if a particular variant does not\r
	32	exist in the destination format then we will treat it\r
	33	as equivalent to the last non-variant we created.\r
	34	\r
	35	For anything else that does not exist in the\r
	36	destination format, we will first try to break the\r
	37	source's phoneme into parts (e.g. see the treatment\r
	38	of opt_ol_as_in_gold by eSpeak and bbcmicro), and if\r
	39	that still doesn't work then we drop a character\r
	40	(warning depending on the source format's setting of\r
	41	safe_to_drop_characters). makeDic does however warn\r
	42	about any non-variant consonants, or non-variant\r
	43	vowels that weren't marked optional, missing from a\r
	44	format. """\r
	45	a_as_in_ah = vowel()\r
	46	_, var1_a_as_in_ah = variant()\r
	47	_, var3_a_as_in_ah = variant()\r
	48	_, var4_a_as_in_ah = variant()\r
	49	_, var5_a_as_in_ah = variant()\r
	50	a_as_in_apple = vowel()\r
	51	u_as_in_but = vowel() # or the first part of un as in hunt\r
	52	_, var1_u_as_in_but = variant()\r
	53	o_as_in_orange = vowel()\r
	54	_, var1_o_as_in_orange = variant()\r
	55	_, var2_o_as_in_orange = variant()\r
	56	o_as_in_now = vowel()\r
	57	_, var1_o_as_in_now = variant()\r
	58	a_as_in_ago = vowel()\r
	59	_, var1_a_as_in_ago = variant()\r
	60	e_as_in_herd = vowel()\r
	61	_, ar_as_in_year = variant()\r
	62	eye = vowel()\r
	63	_, var1_eye = variant()\r
	64	b = consonant()\r
	65	ch = consonant()\r
	66	d = consonant()\r
	67	th_as_in_them = consonant()\r
	68	e_as_in_them = vowel()\r
	69	_, var1_e_as_in_them = variant()\r
	70	a_as_in_air = vowel()\r
	71	_, var1_a_as_in_air = variant()\r
	72	_, var2_a_as_in_air = variant()\r
	73	_, var3_a_as_in_air = variant()\r
	74	_, var4_a_as_in_air = variant()\r
	75	a_as_in_ate = vowel()\r
	76	_, var1_a_as_in_ate = variant()\r
	77	f = consonant()\r
	78	g = consonant()\r
	79	h = consonant()\r
	80	i_as_in_it = vowel()\r
	81	_, var1_i_as_in_it = variant()\r
	82	_, var2_i_as_in_it = variant()\r
	83	ear = vowel()\r
	84	_, var1_ear = variant()\r
	85	_, var2_ear = variant()\r
	86	e_as_in_eat = vowel()\r
	87	_, var1_e_as_in_eat = variant()\r
	88	j_as_in_jump = consonant()\r
	89	k = consonant()\r
	90	_, opt_scottish_loch = variant()\r
	91	l = consonant()\r
	92	_, var1_l = variant()\r
	93	m = consonant()\r
	94	n = consonant()\r
	95	ng = consonant()\r
	96	o_as_in_go = vowel()\r
	97	_, var1_o_as_in_go = variant()\r
	98	_, var2_o_as_in_go = variant()\r
	99	opt_ol_as_in_gold = opt_vowel() # see eSpeak / bbcmicro\r
	100	oy_as_in_toy = vowel()\r
	101	_, var1_oy_as_in_toy = variant()\r
	102	p = consonant()\r
	103	r = consonant()\r
	104	_, var1_r = variant()\r
	105	s = consonant()\r
	106	sh = consonant()\r
	107	t = consonant()\r
	108	_, var1_t = variant()\r
	109	th_as_in_think = consonant()\r
	110	oor_as_in_poor = vowel()\r
	111	_, var1_oor_as_in_poor = variant()\r
	112	_, opt_u_as_in_pull = variant()\r
	113	opt_ul_as_in_pull = opt_vowel() # see eSpeak / bbcmicro\r
	114	oo_as_in_food = vowel()\r
	115	_, var1_oo_as_in_food = variant()\r
	116	_, var2_oo_as_in_food = variant()\r
	117	close_to_or = vowel()\r
	118	_, var1_close_to_or = variant()\r
	119	_, var2_close_to_or = variant()\r
	120	_, var3_close_to_or = variant()\r
	121	v = consonant()\r
	122	w = consonant()\r
	123	_, var1_w = variant()\r
	124	y = consonant()\r
	125	z = consonant()\r
	126	ge_of_blige_etc = consonant()\r
	127	glottal_stop = other()\r
	128	syllable_separator = other()\r
	129	_, primary_stress = variant()\r
	130	_, secondary_stress = variant()\r
	131	text_sharp = other()\r
	132	text_underline = other()\r
	133	text_question = other()\r
	134	text_exclamation = other()\r
	135	text_comma = other()\r
	136	ipa_colon = other() # for catching missed cases\r
	137	del _ ; return locals()\r
	138	\r
	139	def LexFormats():\r
	140	"""Makes the phoneme conversion tables of each format.\r
	141	Each table has string to phoneme entries and phoneme\r
	142	to string entries. The string to phoneme entries are\r
	143	used when converting OUT of that format, and the\r
	144	phoneme to string entries are used when converting IN\r
	145	(so you can recognise phonemes you don't support and\r
	146	convert them to something else). By default, a tuple\r
	147	of the form (string,phoneme) will create entries in\r
	148	BOTH directions; one-directional entries are created\r
	149	via (string,phoneme,False) or (phoneme,string,False).\r
	150	The makeDic function checks the keys are unique.\r
	151	\r
	152	First parameter is always a description of the\r
	153	format, then come the phoneme entries as described\r
	154	above, then any additional settings:\r
	155	\r
	156	stress_comes_before_vowel (default False means any\r
	157	stress mark goes AFTER the affected vowel; set to\r
	158	True if the format requires stress placed before)\r
	159	\r
	160	word_separator (default same as phoneme_separator)\r
	161	phoneme_separator (default " ")\r
	162	clause_separator (default newline)\r
	163	\r
	164	(For a special case, clause_separator can also be\r
	165	set to a function. If that happens, the function\r
	166	will be called whenever lexconvert needs to output\r
	167	a list of (lists of words) in this format. See\r
	168	bbcmicro for an example function clause_separator)\r
	169	\r
	170	safe_to_drop_characters (default False, can be a\r
	171	string of safe characters or True = all; controls\r
	172	warnings when unrecognised characters are found)\r
	173	\r
	174	approximate_missing (default False) - if True,\r
	175	makeDic will attempt to compensate for missing\r
	176	phonemes by approximating them to others, instead of\r
	177	warning about them. This is useful for American codes\r
	178	that can't cope with all the British English phonemes.\r
	179	(Approximation is done automatically anyway in the\r
	180	case of variant phonemes; approximate_missing adds in\r
	181	some additional approximations - see comments in code)\r
	182	\r
	183	cleanup_regexps (default none) - optional list of\r
	184	(search,replace) regular expressions to "clean up"\r
	185	after converting each word INTO this format\r
	186	cleanup_func (default none) - optional special-case\r
	187	function to pass result through after cleanup_regexps\r
	188	\r
	189	cvtOut_regexps (default none) - optional list of\r
	190	(search,replace) regular expressions to "clean up"\r
	191	before starting to convert OUT of this format\r
	192	cvtOut_func (default none) - optional special-case\r
	193	function to pass through before any cvtOut_regexps\r
	194	\r
	195	inline_format (default "%s") the format string for\r
	196	printing a word with --phones or --phones2phones\r
	197	(can be used to put markup around each word)\r
	198	(can also be a function taking the phonetic word\r
	199	and returning the resulting string, e.g. bbcmicro)\r
	200	\r
	201	output_is_binary (default False) - True if the output\r
	202	is almost certainly unsuitable for a terminal; will\r
	203	cause lexconvert to refuse to print phonemes unless\r
	204	its standard output is redirected to a file or pipe\r
	205	(affects the --phones and --phones2phones options)\r
	206	\r
	207	inline_header (default none) text to print first\r
	208	when outputting from --phones or --phones2phones\r
	209	inline_footer (default none) text to print last\r
	210	inline_oneoff_header (default none) text to print\r
	211	before inline_header on the first time only\r
	212	\r
	213	lex_filename - filename of a lexicon file. If this\r
	214	is not specified, there is no support for writing a\r
	215	lexicon in this format: there can still be READ\r
	216	support if you define lex_read_function to open the\r
	217	lexicon by itself, but otherwise the format can be\r
	218	used only with --phones and --phones2phones.\r
	219	\r
	220	lex_entry_format - format string for writing each\r
	221	(word, pronunciation) entry to the lexicon file.\r
	222	This is also needed for lexicon-write support.\r
	223	\r
	224	lex_header, lex_footer - optional strings to write\r
	225	at the beginning and at the end of the lexicon file\r
	226	(can also be functions that take the open file as a\r
	227	parameter, e.g. for bbcmicro; lex_footer is\r
	228	allowed to close the file if it needs to do\r
	229	something with it afterwards)\r
	230	\r
	231	lex_word_case - optional "upper" or "lower" to\r
	232	force a particular case for lexicon words (not\r
	233	pronunciations - they're determined by the table).\r
	234	The default is to allow words to be in either case.\r
	235	\r
	236	lex_type (default "") - used by the --formats\r
	237	option when summarising the support for each format\r
	238	\r
	239	lex_read_function - Python function to READ the\r
	240	lexicon file and return a (word,phonemes) list.\r
	241	If this is not specified, there's no read support\r
	242	for lexicons in this format (but there can still be\r
	243	write support - see above - and you can still use\r
	244	--phones and --phones2phones). If lex_filename is\r
	245	specified then this function will be given the open\r
	246	file as a parameter. """\r
	247	\r
	248	phonemes = Phonemes() ; globals().update(phonemes)\r
	249	return { "festival" : makeDic(\r
	250	"Festival's British voice",\r
	251	('0',syllable_separator),\r
	252	('1',primary_stress),\r
	253	('2',secondary_stress),\r
	254	('aa',a_as_in_ah),\r
	255	('a',a_as_in_apple),\r
	256	('uh',u_as_in_but),\r
	257	('o',o_as_in_orange),\r
	258	('au',o_as_in_now),\r
	259	('@',a_as_in_ago),\r
	260	('@@',e_as_in_herd),\r
	261	('ai',eye),\r
	262	('b',b),\r
	263	('ch',ch),\r
	264	('d',d),\r
	265	('dh',th_as_in_them),\r
	266	('e',e_as_in_them),\r
	267	(ar_as_in_year,'@@',False),\r
	268	('e@',a_as_in_air),\r
	269	('ei',a_as_in_ate),\r
	270	('f',f),\r
	271	('g',g),\r
	272	('h',h),\r
	273	('i',i_as_in_it),\r
	274	('i@',ear),\r
	275	('ii',e_as_in_eat),\r
	276	('jh',j_as_in_jump),\r
	277	('k',k),\r
	278	('l',l),\r
	279	('m',m),\r
	280	('n',n),\r
	281	('ng',ng),\r
	282	('ou',o_as_in_go),\r
	283	('oi',oy_as_in_toy),\r
	284	('p',p),\r
	285	('r',r),\r
	286	('s',s),\r
	287	('sh',sh),\r
	288	('t',t),\r
	289	('th',th_as_in_think),\r
	290	('u@',oor_as_in_poor),\r
	291	('u',opt_u_as_in_pull),\r
	292	('uu',oo_as_in_food),\r
	293	('oo',close_to_or),\r
	294	('v',v),\r
	295	('w',w),\r
	296	('y',y),\r
	297	('z',z),\r
	298	('zh',ge_of_blige_etc),\r
	299	lex_filename=ifset("HOME",os.environ.get("HOME","")+os.sep)+".festivalrc",\r
	300	lex_entry_format="(lex.add.entry '( \"%s\" n %s))\n",\r
	301	lex_header=";; -- mode: lisp --\n(eval (list voice_default))\n",\r
	302	lex_read_function = lambda args:eval('['+getoutput("grep -vi parameter.set < ~/.festivalrc \| grep -v '(eval' \| sed -e 's/;.//' -e 's/.lex.add.entry//' -e s/\"'\"'[(] \"/[\"/' -e 's/\" [^ ] /\",(\"/' -e 's/\".$/&\"],/' -e 's/[()]/ /g' -e 's/ / /g'")+']'),\r
	303	safe_to_drop_characters=True, # TODO: really? (could instead give a string of known-safe characters)\r
	304	cleanup_func = festival_group_stress,\r
	305	),\r
	306	\r
	307	"example" : makeVariantDic(\r
	308	"A small built-in example lexicon for testing when you don't have your full custom lexicon to hand. Use --convert to write it in one of the other formats and see if a synth can import it.",\r
	309	lex_read_function = lambda *args: [\r
	310	("Shadrach","shei1drak"),\r
	311	("Meshach","mii1shak"),\r
	312	("Abednego","@be1dniigou"),\r
	313	], cleanup_func = None,\r
	314	lex_filename=None, lex_entry_format=None, noInherit=True),\r
	315	\r
	316	"festival-cmu" : makeVariantDic(\r
	317	"American CMU version of Festival",\r
	318	('ae',a_as_in_apple),\r
	319	('ah',u_as_in_but),\r
	320	('ax',a_as_in_ago),\r
	321	(o_as_in_orange,'aa',False),\r
	322	('aw',o_as_in_now),\r
	323	('er',e_as_in_herd), # TODO: check this one\r
	324	('ay',eye),\r
	325	('eh',e_as_in_them),\r
	326	(ar_as_in_year,'er',False),\r
	327	(a_as_in_air,'er',False),\r
	328	('ey',a_as_in_ate),\r
	329	('hh',h),\r
	330	('ih',i_as_in_it),\r
	331	('ey ah',ear),\r
	332	('iy',e_as_in_eat),\r
	333	('ow',o_as_in_go),\r
	334	('oy',oy_as_in_toy),\r
	335	('uh',oor_as_in_poor),\r
	336	('uw',oo_as_in_food),\r
	337	('ao',close_to_or),\r
	338	),\r
	339	\r
	340	"espeak" : makeDic(\r
	341	"eSpeak's default British voice", # but eSpeak's phoneme representation isn't always that simple, hence the regexps at the end\r
	342	('%',syllable_separator),\r
	343	("'",primary_stress),\r
	344	(',',secondary_stress),\r
	345	# TODO: glottal_stop? (in regional pronunciations etc)\r
	346	('A:',a_as_in_ah),\r
	347	('A@',a_as_in_ah,False),\r
	348	('A',var1_a_as_in_ah),\r
	349	('a',a_as_in_apple),\r
	350	('aa',a_as_in_apple,False),\r
	351	('a2',a_as_in_apple,False), # TODO: this is actually an a_as_in_apple variant in espeak; festival @1 is not in mrpa PhoneSet\r
	352	('&',a_as_in_apple,False),\r
	353	('V',u_as_in_but),\r
	354	('0',o_as_in_orange),\r
	355	('aU',o_as_in_now),\r
	356	('@',a_as_in_ago),\r
	357	('a#',a_as_in_ago,False), # (TODO: eSpeak sometimes uses a# in 'had' when in a sentence, and this doesn't always sound good on other synths; might sometimes want to convert it to a_as_in_apple; not sure what contexts would call for this though)\r
	358	('3:',e_as_in_herd),\r
	359	('3',var1_a_as_in_ago),\r
	360	('@2',a_as_in_ago,False),\r
	361	('@-',a_as_in_ago,False), # (eSpeak @- sounds to me like a shorter version of @, TODO: double-check the relationship between @ and @2 in Festival)\r
	362	('aI',eye),\r
	363	('aI2',eye,False),\r
	364	('aI;',eye,False),\r
	365	('aI2;',eye,False),\r
	366	('b',b),\r
	367	('tS',ch),\r
	368	('d',d),\r
	369	('D',th_as_in_them),\r
	370	('E',e_as_in_them),\r
	371	(ar_as_in_year,'3:',False),\r
	372	('e@',a_as_in_air),\r
	373	('eI',a_as_in_ate),\r
	374	('f',f),\r
	375	('g',g),\r
	376	('h',h),\r
	377	('I',i_as_in_it),\r
	378	('I;',i_as_in_it,False),\r
	379	('i',i_as_in_it,False),\r
	380	('I2',var2_i_as_in_it,False),\r
	381	('I2;',var2_i_as_in_it,False),\r
	382	('i@',ear),\r
	383	('i@3',var2_ear),\r
	384	('i:',e_as_in_eat),\r
	385	('i:;',e_as_in_eat,False),\r
	386	('dZ',j_as_in_jump),\r
	387	('k',k),\r
	388	('x',opt_scottish_loch),\r
	389	('l',l),\r
	390	('L',l,False),\r
	391	('m',m),\r
	392	('n',n),\r
	393	('N',ng),\r
	394	('oU',o_as_in_go),\r
	395	('oUl',opt_ol_as_in_gold), # (espeak says "gold" in a slightly 'posh' way though) (if dest format doesn't have opt_ol_as_in_gold, it'll get o_as_in_go + the l)\r
	396	('OI',oy_as_in_toy),\r
	397	('p',p),\r
	398	('r',r),\r
	399	('r-',r,False),\r
	400	('s',s),\r
	401	('S',sh),\r
	402	('t',t),\r
	403	('T',th_as_in_think),\r
	404	('U@',oor_as_in_poor),\r
	405	('U',opt_u_as_in_pull),\r
	406	('@5',opt_u_as_in_pull,False),\r
	407	('Ul',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the U, then the l\r
	408	('u:',oo_as_in_food),\r
	409	('O:',close_to_or),\r
	410	('O@',var3_close_to_or),\r
	411	('o@',var3_close_to_or,False),\r
	412	('O',var3_close_to_or,False),\r
	413	('v',v),\r
	414	('w',w),\r
	415	('j',y),\r
	416	('z',z),\r
	417	('Z',ge_of_blige_etc),\r
	418	lex_filename = "en_extra",\r
	419	lex_entry_format = "%s %s\n",\r
	420	lex_read_function = lambda lexfile: [x for x in [l.split()[:2] for l in lexfile.readlines()] if len(x)==2 and not '//' in x[0]],\r
	421	lex_footer=lambda f:(f.close(),os.system("espeak --compile=en")), # see also a bit of special-case code in mainopt_convert\r
	422	inline_format = "[[%s]]",\r
	423	word_separator=" ",phoneme_separator="",\r
	424	stress_comes_before_vowel=True,\r
	425	safe_to_drop_characters="_: !",\r
	426	cleanup_regexps=[\r
	427	("k'a2n","k'@n"),\r
	428	("ka2n","k@n"),\r
	429	("gg","g"),\r
	430	("@U","oU"), # (eSpeak uses oU to represent @U; difference is given by its accent parameters)\r
	431	("([iU]\|([AO]:))@r$","\1@"),\r
	432	("([^e])@r",r"\1_remove_3"),("_remove_",""),\r
	433	# (r"([^iU]@)l",r"\1L") # only in older versions of espeak (not valid in more recent versions)\r
	434	("rr$","r"),\r
	435	("3:r$","3:"),\r
	436	("%%+","%"),("^%",""),("%$",""),\r
	437	# TODO: 'declared' & 'declare' the 'r' after the 'E' sounds a bit 'regional' (but pretty). but sounds incomplete w/out 'r', and there doesn't seem to be an E2 or E@\r
	438	# TODO: consider adding 'g' to words ending in 'N' (if want the 'g' pronounced in '-ng' words) (however, careful of words like 'yankee' where the 'g' would be followed by a 'k'; this may also be a problem going into the next word)\r
	439	],\r
	440	cvtOut_regexps = [\r
	441	("e@r$","e@"), ("e@r([bdDfghklmnNprsStTvwjzZ])",r"e@\1"), # because the 'r' is implicit in other synths (but DO have it if there's another vowel to follow)\r
	442	],\r
	443	),\r
	444	\r
	445	"sapi" : makeDic(\r
	446	"Microsoft Speech API (American English)",\r
	447	('-',syllable_separator),\r
	448	('1',primary_stress),\r
	449	('2',secondary_stress),\r
	450	('aa',a_as_in_ah),\r
	451	('ae',a_as_in_apple),\r
	452	('ah',u_as_in_but),\r
	453	('ao',o_as_in_orange),\r
	454	('aw',o_as_in_now),\r
	455	('ax',a_as_in_ago),\r
	456	('er',e_as_in_herd),\r
	457	('ay',eye),\r
	458	('b',b),\r
	459	('ch',ch),\r
	460	('d',d),\r
	461	('dh',th_as_in_them),\r
	462	('eh',e_as_in_them),\r
	463	('ey',var1_e_as_in_them),\r
	464	(a_as_in_ate,'ey',False),\r
	465	('f',f),\r
	466	('g',g),\r
	467	('h',h), # Jan suggested 'hh', but I can't get this to work on Windows XP (TODO: try newer versions of Windows)\r
	468	('ih',i_as_in_it),\r
	469	('iy',e_as_in_eat),\r
	470	('jh',j_as_in_jump),\r
	471	('k',k),\r
	472	('l',l),\r
	473	('m',m),\r
	474	('n',n),\r
	475	('ng',ng),\r
	476	('ow',o_as_in_go),\r
	477	('oy',oy_as_in_toy),\r
	478	('p',p),\r
	479	('r',r),\r
	480	('s',s),\r
	481	('sh',sh),\r
	482	('t',t),\r
	483	('th',th_as_in_think),\r
	484	('uh',oor_as_in_poor),\r
	485	('uw',oo_as_in_food),\r
	486	('AO',close_to_or),\r
	487	('v',v),\r
	488	('w',w),\r
	489	# ('x',var1_w), # suggested by Jan, but I can't get this to work on Windows XP (TODO: try newer versions of Windows)\r
	490	('y',y),\r
	491	('z',z),\r
	492	('zh',ge_of_blige_etc),\r
	493	approximate_missing=True,\r
	494	lex_filename="run-ptts.bat", # write-only for now\r
	495	lex_header = "rem You have to run this file\nrem with ptts.exe in the same directory\nrem to add these words to the SAPI lexicon\n\n",\r
	496	lex_entry_format='ptts -la %s "%s"\n',\r
	497	inline_format = '<pron sym="%s"/>',\r
	498	safe_to_drop_characters=True, # TODO: really?\r
	499	),\r
	500	\r

1

#!/usr/bin/env python\r

2

# May be run with either Python 2 or Python 3\r

3

\r

4

"""lexconvert v0.32 - convert phonemes between different speech synthesizers etc\r

5

6

\r

7

# Run without arguments for usage information\r

8

\r

9

# This program is free software; you can redistribute it and/or modify\r

10

# it under the terms of the GNU General Public License as published by\r

11

# the Free Software Foundation; either version 3 of the License, or\r

12

# (at your option) any later version.\r

13

#\r

14

# This program is distributed in the hope that it will be useful,\r

15

# but WITHOUT ANY WARRANTY; without even the implied warranty of\r

16

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r

17

# GNU General Public License for more details.\r

18

\r

19

# Old versions of this code are being kept in the E-GuideDog SVN repository at\r

20

# http://svn.code.sf.net/p/e-guidedog/code/ssb22/lexconvert\r

21

# and on GitHub at https://github.com/ssb22/lexconvert\r

22

# and on GitLab at https://gitlab.com/ssb22/lexconvert\r

23

# and on Bitbucket https://bitbucket.org/ssb22/lexconvert\r

24

# and at https://gitlab.developers.cam.ac.uk/ssb22/lexconvert\r

25

# although some early ones are missing.\r

26

\r

27

def Phonemes():
    """Build the table of phonemes used by the format tables.

    Each phoneme is created by a call to vowel(), opt_vowel(),
    consonant(), variant() or other() and bound to a local name.
    The function returns its local namespace, i.e. a dict that
    maps each of those names to the phoneme created for it.

    The ORDER of the calls below is significant, so do not
    rearrange them: if a variant does not exist in the
    destination format, it is treated as equivalent to the last
    non-variant that was created before it.

    For anything else that does not exist in the destination
    format, we first try to break the source's phoneme into
    parts (e.g. see the treatment of opt_ol_as_in_gold by eSpeak
    and bbcmicro), and if that still doesn't work then a
    character is dropped (warning depending on the source
    format's setting of safe_to_drop_characters).  makeDic does
    however warn about any non-variant consonants, or
    non-variant vowels that weren't marked optional, missing
    from a format.
    """
    # variant() is unpacked as a pair; only its second item is kept.
    # The first item goes to the throwaway name _skip, which is
    # deleted before the namespace is returned.
    a_as_in_ah = vowel()
    _skip, var1_a_as_in_ah = variant()
    _skip, var3_a_as_in_ah = variant()
    _skip, var4_a_as_in_ah = variant()
    _skip, var5_a_as_in_ah = variant()

    a_as_in_apple = vowel()

    u_as_in_but = vowel()  # or the first part of un as in hunt
    _skip, var1_u_as_in_but = variant()

    o_as_in_orange = vowel()
    _skip, var1_o_as_in_orange = variant()
    _skip, var2_o_as_in_orange = variant()

    o_as_in_now = vowel()
    _skip, var1_o_as_in_now = variant()

    a_as_in_ago = vowel()
    _skip, var1_a_as_in_ago = variant()

    e_as_in_herd = vowel()
    _skip, ar_as_in_year = variant()

    eye = vowel()
    _skip, var1_eye = variant()

    b = consonant()
    ch = consonant()
    d = consonant()
    th_as_in_them = consonant()

    e_as_in_them = vowel()
    _skip, var1_e_as_in_them = variant()

    a_as_in_air = vowel()
    _skip, var1_a_as_in_air = variant()
    _skip, var2_a_as_in_air = variant()
    _skip, var3_a_as_in_air = variant()
    _skip, var4_a_as_in_air = variant()

    a_as_in_ate = vowel()
    _skip, var1_a_as_in_ate = variant()

    f = consonant()
    g = consonant()
    h = consonant()

    i_as_in_it = vowel()
    _skip, var1_i_as_in_it = variant()
    _skip, var2_i_as_in_it = variant()

    ear = vowel()
    _skip, var1_ear = variant()
    _skip, var2_ear = variant()

    e_as_in_eat = vowel()
    _skip, var1_e_as_in_eat = variant()

    j_as_in_jump = consonant()

    k = consonant()
    _skip, opt_scottish_loch = variant()

    l = consonant()
    _skip, var1_l = variant()

    m = consonant()
    n = consonant()
    ng = consonant()

    o_as_in_go = vowel()
    _skip, var1_o_as_in_go = variant()
    _skip, var2_o_as_in_go = variant()

    opt_ol_as_in_gold = opt_vowel()  # see eSpeak / bbcmicro

    oy_as_in_toy = vowel()
    _skip, var1_oy_as_in_toy = variant()

    p = consonant()

    r = consonant()
    _skip, var1_r = variant()

    s = consonant()
    sh = consonant()

    t = consonant()
    _skip, var1_t = variant()

    th_as_in_think = consonant()

    oor_as_in_poor = vowel()
    _skip, var1_oor_as_in_poor = variant()
    _skip, opt_u_as_in_pull = variant()

    opt_ul_as_in_pull = opt_vowel()  # see eSpeak / bbcmicro

    oo_as_in_food = vowel()
    _skip, var1_oo_as_in_food = variant()
    _skip, var2_oo_as_in_food = variant()

    close_to_or = vowel()
    _skip, var1_close_to_or = variant()
    _skip, var2_close_to_or = variant()
    _skip, var3_close_to_or = variant()

    v = consonant()

    w = consonant()
    _skip, var1_w = variant()

    y = consonant()
    z = consonant()
    ge_of_blige_etc = consonant()

    glottal_stop = other()

    syllable_separator = other()
    _skip, primary_stress = variant()
    _skip, secondary_stress = variant()

    text_sharp = other()
    text_underline = other()
    text_question = other()
    text_exclamation = other()
    text_comma = other()
    ipa_colon = other()  # for catching missed cases

    # Remove the throwaway name so it is not part of the returned table
    del _skip
    return locals()

138

\r

139

def LexFormats():\r

140

"""Makes the phoneme conversion tables of each format.\r

141

Each table has string to phoneme entries and phoneme\r

142

to string entries. The string to phoneme entries are\r

143

used when converting OUT of that format, and the\r

144

phoneme to string entries are used when converting IN\r

145

(so you can recognise phonemes you don't support and\r

146

convert them to something else). By default, a tuple\r

147

of the form (string,phoneme) will create entries in\r

148

BOTH directions; one-directional entries are created\r

149

via (string,phoneme,False) or (phoneme,string,False).\r

150

The makeDic function checks the keys are unique.\r

151

\r

152

First parameter is always a description of the\r

153

format, then come the phoneme entries as described\r

154

above, then any additional settings:\r

155

\r

156

stress_comes_before_vowel (default False means any\r

157

stress mark goes AFTER the affected vowel; set to\r

158

True if the format requires stress placed before)\r

159

\r

160

word_separator (default same as phoneme_separator)\r

161

phoneme_separator (default " ")\r

162

clause_separator (default newline)\r

163

\r

164

(For a special case, clause_separator can also be\r

165

set to a function. If that happens, the function\r

166

will be called whenever lexconvert needs to output\r

167

a list of (lists of words) in this format. See\r

168

bbcmicro for an example function clause_separator)\r

169

\r

170

safe_to_drop_characters (default False, can be a\r

171

string of safe characters or True = all; controls\r

172

warnings when unrecognised characters are found)\r

173

\r

174

approximate_missing (default False) - if True,\r

175

makeDic will attempt to compensate for missing\r

176

phonemes by approximating them to others, instead of\r

177

warning about them. This is useful for American codes\r

178

that can't cope with all the British English phonemes.\r

179

(Approximation is done automatically anyway in the\r

180

case of variant phonemes; approximate_missing adds in\r

181

some additional approximations - see comments in code)\r

182

\r

183

cleanup_regexps (default none) - optional list of\r

184

(search,replace) regular expressions to "clean up"\r

185

after converting each word INTO this format\r

186

cleanup_func (default none) - optional special-case\r

187

function to pass result through after cleanup_regexps\r

188

\r

189

cvtOut_regexps (default none) - optional list of\r

190

(search,replace) regular expressions to "clean up"\r

191

before starting to convert OUT of this format\r

192

cvtOut_func (default none) - optional special-case\r

193

function to pass through before any cvtOut_regexps\r

194

\r

195

inline_format (default "%s") the format string for\r

196

printing a word with --phones or --phones2phones\r

197

(can be used to put markup around each word)\r

198

(can also be a function taking the phonetic word\r

199

and returning the resulting string, e.g. bbcmicro)\r

200

\r

201

output_is_binary (default False) - True if the output\r

202

is almost certainly unsuitable for a terminal; will\r

203

cause lexconvert to refuse to print phonemes unless\r

204

its standard output is redirected to a file or pipe\r

205

(affects the --phones and --phones2phones options)\r

206

\r

207

inline_header (default none) text to print first\r

208

when outputting from --phones or --phones2phones\r

209

inline_footer (default none) text to print last\r

210

inline_oneoff_header (default none) text to print\r

211

before inline_header on the first time only\r

212

\r

213

lex_filename - filename of a lexicon file. If this\r

214

is not specified, there is no support for writing a\r

215

lexicon in this format: there can still be READ\r

216

support if you define lex_read_function to open the\r

217

lexicon by itself, but otherwise the format can be\r

218

used only with --phones and --phones2phones.\r

219

\r

220

lex_entry_format - format string for writing each\r

221

(word, pronunciation) entry to the lexicon file.\r

222

This is also needed for lexicon-write support.\r

223

\r

224

lex_header, lex_footer - optional strings to write\r

225

at the beginning and at the end of the lexicon file\r

226

(can also be functions that take the open file as a\r

227

parameter, e.g. for bbcmicro; lex_footer is\r

228

allowed to close the file if it needs to do\r

229

something with it afterwards)\r

230

\r

231

lex_word_case - optional "upper" or "lower" to\r

232

force a particular case for lexicon words (not\r

233

pronunciations - they're determined by the table).\r

234

The default is to allow words to be in either case.\r

235

\r

236

lex_type (default "") - used by the --formats\r

237

option when summarising the support for each format\r

238

\r

239

lex_read_function - Python function to READ the\r

240

lexicon file and return a (word,phonemes) list.\r

241

If this is not specified, there's no read support\r

242

for lexicons in this format (but there can still be\r

243

write support - see above - and you can still use\r

244

--phones and --phones2phones). If lex_filename is\r

245

specified then this function will be given the open\r

246

file as a parameter. """\r

247

\r

248

phonemes = Phonemes() ; globals().update(phonemes)\r

249

return { "festival" : makeDic(\r

250

"Festival's British voice",\r

251

('0',syllable_separator),\r

252

('1',primary_stress),\r

253

('2',secondary_stress),\r

254

('aa',a_as_in_ah),\r

255

('a',a_as_in_apple),\r

256

('uh',u_as_in_but),\r

257

('o',o_as_in_orange),\r

258

('au',o_as_in_now),\r

259

('@',a_as_in_ago),\r

260

('@@',e_as_in_herd),\r

('ai',eye),\r

('b',b),\r

('ch',ch),\r

('d',d),\r

('dh',th_as_in_them),\r

266

('e',e_as_in_them),\r

267

(ar_as_in_year,'@@',False),\r

268

('e@',a_as_in_air),\r

269

('ei',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('i',i_as_in_it),\r

('i@',ear),\r

('ii',e_as_in_eat),\r

276

('jh',j_as_in_jump),\r

('k',k),\r

('l',l),\r

('m',m),\r

('n',n),\r

('ng',ng),\r

('ou',o_as_in_go),\r

('oi',oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('s',s),\r

('sh',sh),\r

('t',t),\r

('th',th_as_in_think),\r

290

('u@',oor_as_in_poor),\r

291

('u',opt_u_as_in_pull),\r

292

('uu',oo_as_in_food),\r

293

('oo',close_to_or),\r

('v',v),\r

('w',w),\r

('y',y),\r

('z',z),\r

('zh',ge_of_blige_etc),\r

299

lex_filename=ifset("HOME",os.environ.get("HOME","")+os.sep)+".festivalrc",\r

300

lex_entry_format="(lex.add.entry '( \"%s\" n %s))\n",\r

301

lex_header=";; -*- mode: lisp -*-\n(eval (list voice_default))\n",\r

302

lex_read_function = lambda *args:eval('['+getoutput("grep -vi parameter.set < ~/.festivalrc | grep -v '(eval' | sed -e 's/;.*//' -e 's/.lex.add.entry//' -e s/\"'\"'[(] *\"/[\"/' -e 's/\" [^ ]* /\",(\"/' -e 's/\".*$/&\"],/' -e 's/[()]/ /g' -e 's/ */ /g'")+']'),\r

303

safe_to_drop_characters=True, # TODO: really? (could instead give a string of known-safe characters)\r

304

cleanup_func = festival_group_stress,\r

305

),\r

306

\r

307

"example" : makeVariantDic(\r

308

"A small built-in example lexicon for testing when you don't have your full custom lexicon to hand. Use --convert to write it in one of the other formats and see if a synth can import it.",\r

309

lex_read_function = lambda *args: [\r

310

("Shadrach","shei1drak"),\r

311

("Meshach","mii1shak"),\r

312

("Abednego","@be1dniigou"),\r

313

], cleanup_func = None,\r

314

lex_filename=None, lex_entry_format=None, noInherit=True),\r

315

\r

316

"festival-cmu" : makeVariantDic(\r

317

"American CMU version of Festival",\r

318

('ae',a_as_in_apple),\r

319

('ah',u_as_in_but),\r

320

('ax',a_as_in_ago),\r

321

(o_as_in_orange,'aa',False),\r

322

('aw',o_as_in_now),\r

323

('er',e_as_in_herd), # TODO: check this one\r

324

('ay',eye),\r

325

('eh',e_as_in_them),\r

326

(ar_as_in_year,'er',False),\r

327

(a_as_in_air,'er',False),\r

328

('ey',a_as_in_ate),\r

('hh',h),\r

('ih',i_as_in_it),\r

('ey ah',ear),\r

('iy',e_as_in_eat),\r

333

('ow',o_as_in_go),\r

334

('oy',oy_as_in_toy),\r

335

('uh',oor_as_in_poor),\r

336

('uw',oo_as_in_food),\r

337

('ao',close_to_or),\r

338

),\r

339

\r

340

"espeak" : makeDic(\r

341

"eSpeak's default British voice", # but eSpeak's phoneme representation isn't always that simple, hence the regexps at the end\r

342

('%',syllable_separator),\r

343

("'",primary_stress),\r

344

(',',secondary_stress),\r

345

# TODO: glottal_stop? (in regional pronunciations etc)\r

346

('A:',a_as_in_ah),\r

347

('A@',a_as_in_ah,False),\r

348

('A',var1_a_as_in_ah),\r

349

('a',a_as_in_apple),\r

350

('aa',a_as_in_apple,False),\r

351

('a2',a_as_in_apple,False), # TODO: this is actually an a_as_in_apple variant in espeak; festival @1 is not in mrpa PhoneSet\r

352

('&',a_as_in_apple,False),\r

353

('V',u_as_in_but),\r

354

('0',o_as_in_orange),\r

355

('aU',o_as_in_now),\r

356

('@',a_as_in_ago),\r

357

('a#',a_as_in_ago,False), # (TODO: eSpeak sometimes uses a# in 'had' when in a sentence, and this doesn't always sound good on other synths; might sometimes want to convert it to a_as_in_apple; not sure what contexts would call for this though)\r

358

('3:',e_as_in_herd),\r

359

('3',var1_a_as_in_ago),\r

360

('@2',a_as_in_ago,False),\r

361

('@-',a_as_in_ago,False), # (eSpeak @- sounds to me like a shorter version of @, TODO: double-check the relationship between @ and @2 in Festival)\r

('aI',eye),\r

('aI2',eye,False),\r

('aI;',eye,False),\r

('aI2;',eye,False),\r

('b',b),\r

('tS',ch),\r

('d',d),\r

('D',th_as_in_them),\r

370

('E',e_as_in_them),\r

371

(ar_as_in_year,'3:',False),\r

372

('e@',a_as_in_air),\r

373

('eI',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('I',i_as_in_it),\r

('I;',i_as_in_it,False),\r

379

('i',i_as_in_it,False),\r

380

('I2',var2_i_as_in_it,False),\r

381

('I2;',var2_i_as_in_it,False),\r

382

('i@',ear),\r

383

('i@3',var2_ear),\r

384

('i:',e_as_in_eat),\r

385

('i:;',e_as_in_eat,False),\r

386

('dZ',j_as_in_jump),\r

387

('k',k),\r

388

('x',opt_scottish_loch),\r

('l',l),\r

('L',l,False),\r

('m',m),\r

('n',n),\r

('N',ng),\r

('oU',o_as_in_go),\r

('oUl',opt_ol_as_in_gold), # (espeak says "gold" in a slightly 'posh' way though) (if dest format doesn't have opt_ol_as_in_gold, it'll get o_as_in_go + the l)\r

396

('OI',oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('r-',r,False),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('T',th_as_in_think),\r

404

('U@',oor_as_in_poor),\r

405

('U',opt_u_as_in_pull),\r

406

('@5',opt_u_as_in_pull,False),\r

407

('Ul',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the U, then the l\r

408

('u:',oo_as_in_food),\r

409

('O:',close_to_or),\r

410

('O@',var3_close_to_or),\r

411

('o@',var3_close_to_or,False),\r

412

('O',var3_close_to_or,False),\r

('v',v),\r

('w',w),\r

('j',y),\r

('z',z),\r

('Z',ge_of_blige_etc),\r

418

lex_filename = "en_extra",\r

419

lex_entry_format = "%s %s\n",\r

420

lex_read_function = lambda lexfile: [x for x in [l.split()[:2] for l in lexfile.readlines()] if len(x)==2 and not '//' in x[0]],\r

421

lex_footer=lambda f:(f.close(),os.system("espeak --compile=en")), # see also a bit of special-case code in mainopt_convert\r

422

inline_format = "[[%s]]",\r

423

word_separator=" ",phoneme_separator="",\r

424

stress_comes_before_vowel=True,\r

425

safe_to_drop_characters="_: !",\r

# Regex (pattern, replacement) pairs for tidying eSpeak phoneme strings.
# They are listed in the order they are meant to be applied: later rules
# rely on the output of earlier ones (e.g. the "_remove_" placeholder).
cleanup_regexps=[
    # "a2" between k and n (with or without the ' stress mark) becomes "@"
    ("k'a2n","k'@n"),
    ("ka2n","k@n"),
    # collapse a doubled g
    ("gg","g"),
    ("@U","oU"), # (eSpeak uses oU to represent @U; difference is given by its accent parameters)
    # At the very end of the string, keep the vowel (i, U, A: or O:) and
    # the "@" but drop the final "r".  The replacement MUST be a raw
    # string: in a normal string literal "\1" is the octal escape for
    # chr(1), which would substitute a control character for the vowel
    # instead of the back-reference to group 1.
    ("([iU]|([AO]:))@r$",r"\1@"),
    # Any other "@r" not preceded by "e" becomes "3".  The "_remove_"
    # placeholder separates the back-reference from the digit (r"\13"
    # would be read as group 13); the second rule then deletes it.
    ("([^e])@r",r"\1_remove_3"),("_remove_",""),
    # (r"([^iU]@)l",r"\1L") # only in older versions of espeak (not valid in more recent versions)
    # a doubled r at the end of the string becomes a single r
    ("rr$","r"),
    # drop an "r" that follows "3:" at the end of the string
    ("3:r$","3:"),
    # collapse runs of "%" to one, then strip a leading or trailing "%"
    ("%%+","%"),("^%",""),("%$",""),
    # TODO: 'declared' & 'declare' the 'r' after the 'E' sounds a bit 'regional' (but pretty). but sounds incomplete w/out 'r', and there doesn't seem to be an E2 or E@
    # TODO: consider adding 'g' to words ending in 'N' (if want the 'g' pronounced in '-ng' words) (however, careful of words like 'yankee' where the 'g' would be followed by a 'k'; this may also be a problem going into the next word)
],

440

cvtOut_regexps = [\r

441

("e@r$","e@"), ("e@r([bdDfghklmnNprsStTvwjzZ])",r"e@\1"), # because the 'r' is implicit in other synths (but DO have it if there's another vowel to follow)\r

],\r

),\r

\r

"sapi" : makeDic(\r

"Microsoft Speech API (American English)",\r

447

('-',syllable_separator),\r

448

('1',primary_stress),\r

449

('2',secondary_stress),\r

450

('aa',a_as_in_ah),\r

451

('ae',a_as_in_apple),\r

452

('ah',u_as_in_but),\r

453

('ao',o_as_in_orange),\r

454

('aw',o_as_in_now),\r

455

('ax',a_as_in_ago),\r

456

('er',e_as_in_herd),\r

('ay',eye),\r

('b',b),\r

('ch',ch),\r

('d',d),\r

('dh',th_as_in_them),\r

462

('eh',e_as_in_them),\r

463

('ey',var1_e_as_in_them),\r

464

(a_as_in_ate,'ey',False),\r

465

('f',f),\r

466

('g',g),\r

467

('h',h), # Jan suggested 'hh', but I can't get this to work on Windows XP (TODO: try newer versions of Windows)\r

468

('ih',i_as_in_it),\r

469

('iy',e_as_in_eat),\r

470

('jh',j_as_in_jump),\r

('k',k),\r

('l',l),\r

('m',m),\r

('n',n),\r

('ng',ng),\r

('ow',o_as_in_go),\r

('oy',oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('s',s),\r

('sh',sh),\r

('t',t),\r

('th',th_as_in_think),\r

484

('uh',oor_as_in_poor),\r

485

('uw',oo_as_in_food),\r

486

('AO',close_to_or),\r

487

('v',v),\r

488

('w',w),\r

489

# ('x',var1_w), # suggested by Jan, but I can't get this to work on Windows XP (TODO: try newer versions of Windows)\r

490

('y',y),\r

491

('z',z),\r

492

('zh',ge_of_blige_etc),\r

493

approximate_missing=True,\r

494

lex_filename="run-ptts.bat", # write-only for now\r

495

lex_header = "rem You have to run this file\nrem with ptts.exe in the same directory\nrem to add these words to the SAPI lexicon\n\n",\r

496

lex_entry_format='ptts -la %s "%s"\n',\r

497

inline_format = '<pron sym="%s"/>',\r

498

safe_to_drop_characters=True, # TODO: really?\r

499

),\r

500

\r

501

"cepstral" : makeDic(\r

502

"Cepstral's British English SSML phoneset",\r

503

('0',syllable_separator),\r

504

('1',primary_stress),\r

505

('a',a_as_in_ah),\r

506

('ae',a_as_in_apple),\r

507

('ah',u_as_in_but),\r

508

('oa',o_as_in_orange),\r

509

('aw',o_as_in_now),\r

510

('er',e_as_in_herd),\r

('ay',eye),\r

('b',b),\r

('ch',ch),\r

('d',d),\r

('dh',th_as_in_them),\r

516

('eh',e_as_in_them),\r

517

('e@',a_as_in_air),\r

518

('ey',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('ih',i_as_in_it),\r

('i',e_as_in_eat),\r

('jh',j_as_in_jump),\r

('k',k),\r

('l',l),\r

('m',m),\r

('n',n),\r

('ng',ng),\r

('ow',o_as_in_go),\r

('oy',oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('s',s),\r

('sh',sh),\r

('t',t),\r

('th',th_as_in_think),\r

538

('uh',oor_as_in_poor),\r

539

('uw',oo_as_in_food),\r

540

('ao',close_to_or),\r

('v',v),\r

('w',w),\r

('j',y),\r

('z',z),\r

('zh',ge_of_blige_etc),\r

546

approximate_missing=True,\r

547

lex_filename="lexicon.txt",\r

548

lex_entry_format = "%s 0 %s\n",\r

549

lex_read_function = lambda lexfile: [(word,pronunc) for word, ignore, pronunc in [l.split(None,2) for l in lexfile.readlines()]],\r

550

lex_word_case = "lower",\r

551

inline_format = "<phoneme ph='%s'>p</phoneme>",\r

552

safe_to_drop_characters=True, # TODO: really?\r

553

cleanup_regexps=[(" 1","1"),(" 0","0")],\r

),\r

\r

"mac" : makeDic(\r

"approximation in American English using the [[inpt PHON]] notation of Apple's US voices",\r

558

('=',syllable_separator),\r

559

('1',primary_stress),\r

560

('2',secondary_stress),\r

561

('AA',a_as_in_ah),\r

562

('aa',var5_a_as_in_ah),\r

563

('AE',a_as_in_apple),\r

564

('UX',u_as_in_but),\r

565

(o_as_in_orange,'AA',False),\r

566

('AW',o_as_in_now),\r

567

('AX',a_as_in_ago),\r

568

(e_as_in_herd,'AX',False), # TODO: is this really the best approximation?\r

('AY',eye),\r

('b',b),\r

('C',ch),\r

('d',d),\r

('D',th_as_in_them),\r

574

('EH',e_as_in_them),\r

575

('EY',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('IH',i_as_in_it),\r

('IX',var2_i_as_in_it),\r

581

('IY',e_as_in_eat),\r

582

('J',j_as_in_jump),\r

('k',k),\r

('l',l),\r

('m',m),\r

('n',n),\r

('N',ng),\r

('OW',o_as_in_go),\r

('OY',oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('T',th_as_in_think),\r

596

('UH',oor_as_in_poor),\r

597

('UW',oo_as_in_food),\r

598

('AO',close_to_or),\r

('v',v),\r

('w',w),\r

('y',y),\r

('z',z),\r

('Z',ge_of_blige_etc),\r

604

approximate_missing=True,\r

605

lex_filename="substitute.sh", # write-only for now\r

606

lex_type = "substitution script",\r

607

lex_header = "#!/bin/bash\n\n# I don't yet know how to add to the Apple US lexicon,\n# so here is a 'sed' command you can run on your text\n# to put the pronunciation inline:\n\nsed -E -e :S \\\n",\r

608

lex_entry_format=r" -e 's/(^|[^A-Za-z])%s($|[^A-Za-z[12=])/\1[[inpt PHON]]%s[[inpt TEXT]]\2/g'"+" \\\n",\r

609

# but /g is non-overlapping matches and won't catch 2 words in the lex right next to each other with only one non-alpha in between, so we put :S at start and tS at end to make the whole operation repeat until it hasn't done any more substitutions (hence also the exclusion of [, 1, 2 or = following a word so it doesn't try to substitute stuff inside the phonemes; TODO: assert the lexicon does not contain "inpt", "PHON" or "TEXT")\r

610

lex_footer = lambda f:(f.write(" -e tS\n"),f.close(),os.chmod("substitute.sh",493)), # 493 = 0755, but no way to specify octal that works on both Python 2.5 and Python 3 (0o works on 2.6+)\r

611

inline_format = "[[inpt PHON]]%s[[inpt TEXT]]",\r

612

word_separator=" ",phoneme_separator="",\r

613

safe_to_drop_characters=True, # TODO: really?\r

614

),\r

615

\r

616

"mac-uk" : makeDic(\r

617

"Scansoft/Nuance British voices in Mac OS 10.7+ (system lexicon editing required, see --mac-uk option)",\r

618

('.',syllable_separator),\r

619

("'",primary_stress),\r

620

(secondary_stress,'',False),\r

621

('A',a_as_in_ah),\r

622

('@',a_as_in_apple),\r

623

('$',u_as_in_but),\r

624

(a_as_in_ago,'$',False),\r

625

('A+',o_as_in_orange),\r

626

('a&U',o_as_in_now),\r

627

('E0',e_as_in_herd),\r

('a&I',eye),\r

('b',b),\r

('t&S',ch),\r

('d',d),\r

('D',th_as_in_them),\r

633

('E',e_as_in_them),\r

634

('0',ar_as_in_year),\r

635

('E&$',a_as_in_air),\r

636

('e&I',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('I',i_as_in_it),\r

('I&$',ear),\r

('i',e_as_in_eat),\r

('d&Z',j_as_in_jump),\r

('k',k),\r

('l',l),\r

('m',m),\r

('n',n),\r

('nK',ng),\r

('o&U',o_as_in_go),\r

650

('O&I',oy_as_in_toy),\r

('p',p),\r

('R+',r),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('T',th_as_in_think),\r

657

('O',oor_as_in_poor),\r

658

('U',opt_u_as_in_pull),\r

659

('u',oo_as_in_food),\r

660

(close_to_or,'O',False),\r

('v',v),\r

('w',w),\r

('j',y),\r

('z',z),\r

('Z',ge_of_blige_etc),\r

666

# lex_filename not set (mac-uk code does not permanently save the lexicon; see --mac-uk option to read text)\r

667

lex_read_function = lambda *args:[(w,p) for w,_,p in MacBritish_System_Lexicon(False,os.environ.get("MACUK_VOICE","Daniel")).usable_words()],\r

668

inline_oneoff_header = "(mac-uk phonemes output is for information only; you'll need the --mac-uk or --trymac-uk options to use it)\n",\r

669

word_separator=" ",phoneme_separator="",\r

670

stress_comes_before_vowel=True,\r

671

safe_to_drop_characters=True, # TODO: really?\r

672

cleanup_regexps=[(r'o\&U\.Ol', r'o\&Ul')],\r

673

),\r

674

\r

675

"x-sampa" : makeDic(\r

676

"General X-SAMPA notation, contributed by Jan Weiss",\r

677

('.',syllable_separator),\r

678

('"',primary_stress),\r

679

('%',secondary_stress),\r

680

('A',a_as_in_ah),\r

681

(':',ipa_colon),\r

682

('A:',var3_a_as_in_ah),\r

683

('Ar\\',var4_a_as_in_ah),\r

684

('a:',var5_a_as_in_ah),\r

685

('{',a_as_in_apple),\r

686

('V',u_as_in_but),\r

687

('Q',o_as_in_orange),\r

688

(var1_o_as_in_orange,'A',False),\r

689

('O',var2_o_as_in_orange),\r

690

('aU',o_as_in_now),\r

691

('{O',var1_o_as_in_now),\r

692

('@',a_as_in_ago),\r

693

('3:',e_as_in_herd),\r

('aI',eye),\r

('Ae',var1_eye),\r

('b',b),\r

('tS',ch),\r

('d',d),\r

('D',th_as_in_them),\r

700

('E',e_as_in_them),\r

701

('e',var1_e_as_in_them),\r

702

(ar_as_in_year,'3:',False),\r

703

('E@',a_as_in_air),\r

704

('Er\\',var1_a_as_in_air),\r

705

('e:',var2_a_as_in_air),\r

706

('E:',var3_a_as_in_air),\r

707

('e@',var4_a_as_in_air),\r

708

('eI',a_as_in_ate),\r

709

('{I',var1_a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('I',i_as_in_it),\r

('1',var1_i_as_in_it),\r

('I@',ear),\r

('Ir\\',var1_ear),\r

('i',e_as_in_eat),\r

('i:',var1_e_as_in_eat),\r

719

('dZ',j_as_in_jump),\r

720

('k',k),\r

721

('x',opt_scottish_loch),\r

('l',l),\r

('m',m),\r

('n',n),\r

('N',ng),\r

('@U',o_as_in_go),\r

('oU',var2_o_as_in_go),\r

728

('@}',var1_u_as_in_but),\r

729

('OI',oy_as_in_toy),\r

730

('oI',var1_oy_as_in_toy),\r

731

('p',p),\r

732

('r\\',r),\r

733

(var1_r,'r',False),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('T',th_as_in_think),\r

738

('U@',oor_as_in_poor),\r

739

('Ur\\',var1_oor_as_in_poor),\r

740

('U',opt_u_as_in_pull),\r

741

('}:',oo_as_in_food),\r

742

('u:',var1_oo_as_in_food),\r

743

(var2_oo_as_in_food,'u:',False),\r

744

('O:',close_to_or),\r

745

(var1_close_to_or,'O',False),\r

746

('o:',var2_close_to_or),\r

('v',v),\r

('w',w),\r

('W',var1_w),\r

('j',y),\r

('z',z),\r

('Z',ge_of_blige_etc),\r

753

lex_filename="acapela.txt",\r

754

lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: may be able to convert part-of-speech (NOUN etc) to/from some other formats e.g. Festival\r

755

lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]],\r

756

# TODO: inline_format ?\r

757

word_separator=" ",phoneme_separator="",\r

758

safe_to_drop_characters=True, # TODO: really?\r

759

),\r

760

"vocaloid" : makeVariantDic(\r

761

"X-SAMPA phonemes for Yamaha's Vocaloid singing synthesizer. Contributed by Lorenzo Gatti, who tested in Vocaloid 4 using two American English voices.",\r

762

('-',syllable_separator),\r

763

(primary_stress,'',False), # not used by Vocaloid\r

764

(secondary_stress,'',False),\r

765

('Q',a_as_in_ah),\r

766

(var3_a_as_in_ah,'Q',False),\r

767

(var4_a_as_in_ah,'Q',False),\r

768

(var5_a_as_in_ah,'Q',False),\r

769

('O@',o_as_in_orange),\r

770

(var1_o_as_in_orange,'O@',False),\r

771

(var2_o_as_in_orange, 'O@',False),\r

772

('@U',o_as_in_now),\r

773

('@r',e_as_in_herd),\r

774

(var1_eye, 'aI',False),\r

775

('e',e_as_in_them),\r

776

('I@',ar_as_in_year),\r

777

('e@',a_as_in_air),\r

778

(var1_a_as_in_air, 'e@',False),\r

779

(var2_a_as_in_air, 'e@',False),\r

780

(var3_a_as_in_air, 'e@',False),\r

781

(var4_a_as_in_air, 'e@',False),\r

782

(var1_a_as_in_ate, 'eI', False),\r

783

(var1_i_as_in_it, 'I',False),\r

784

(var1_ear, 'I@',False),\r

785

('i:',e_as_in_eat),\r

786

(var1_e_as_in_eat, 'i:',False),\r

787

(var2_o_as_in_go, '@U', False),\r

788

('V', var1_u_as_in_but),\r

789

(var1_oy_as_in_toy, 'OI',False),\r

790

('r',r),\r

791

('th',t),\r

792

(var1_oor_as_in_poor, '@U',False),\r

793

('u:',oo_as_in_food),\r

794

(var1_oo_as_in_food, 'u:',False),\r

795

(var1_close_to_or,'O:',False),\r

796

(var2_close_to_or,'O:',False),\r

797

(var1_w, 'w', False),\r

798

lex_filename="vocaloid.txt",\r

799

phoneme_separator=" ",\r

800

noInherit=True\r

801

),\r

802

"android-pico" : makeVariantDic(\r

803

'X-SAMPA phonemes for the default \"Pico\" voice in Android (1.6+, American), wrapped in Java code', # you could put en-GB instead of en-US, but it must be installed on the phone\r

804

('A:',a_as_in_ah), # won't sound without the :\r

805

(var5_a_as_in_ah,'A:',False), # a: won't sound\r

806

('@U:',o_as_in_go),\r

807

('I',var1_i_as_in_it), # '1' won't sound\r

808

('i:',e_as_in_eat), # 'i' won't sound\r

809

('u:',oo_as_in_food), # }: won't sound\r

810

('a_I',eye),('a_U',o_as_in_now),('e_I',a_as_in_ate),('O_I',oy_as_in_toy),(var1_oy_as_in_toy,'O_I',False),('o_U',var2_o_as_in_go),\r

811

# Escape the phoneme string so it can sit inside ph=\"...\" in the XML
# that is itself embedded in a Java string literal (see inline_header /
# inline_format of this entry).  Applied in order:
#   1. double every backslash (Java string escaping);
#   2. turn a double quote (the X-SAMPA primary-stress mark) into the XML
#      entity &quot; -- replacing '"' with '"' was a no-op and left a
#      quote that ends both the XML attribute and the Java string early;
#   3. collapse a doubled ':' into a single one.
cleanup_regexps=[(r'\\',r'\\\\'),('"','&quot;'),('::',':')],

812

lex_filename="",lex_entry_format="",\r

813

lex_read_function=None,\r

814

inline_oneoff_header=r'class Speak { public static void speak(android.app.Activity a,String s) { class OnInit implements android.speech.tts.TextToSpeech.OnInitListener { public OnInit(String s) { this.s = s; } public void onInit(int i) { mTts.speak(this.s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private String s; }; if(mTts==null) mTts=new android.speech.tts.TextToSpeech(a,new OnInit(s),"com.svox.pico"); else mTts.speak(s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private static android.speech.tts.TextToSpeech mTts = null; };'+'\n',\r

815

inline_header=r'Speak.speak(this,"<speak xml:lang=\"en-US\">',\r

816

inline_format=r'<phoneme alphabet=\"xsampa\" ph=\"%s\"/>',\r

817

clause_separator=r".\n", # note r"\n" != "\n"\r

818

inline_footer='</speak>");',\r

819

),\r

820

\r

821

"acapela-uk" : makeDic(\r

822

'Acapela-optimised X-SAMPA for UK English voices (e.g. "Peter"), contributed by Jan Weiss',\r

823

('.',syllable_separator),('"',primary_stress),('%',secondary_stress), # copied from "x-sampa", not tested\r

824

('A:',a_as_in_ah),\r

825

('{',a_as_in_apple),\r

826

('V',u_as_in_but),\r

827

('Q',o_as_in_orange),\r

828

('A',var1_o_as_in_orange),\r

829

('O',var2_o_as_in_orange),\r

830

('aU',o_as_in_now),\r

831

('{O',var1_o_as_in_now),\r

832

('@',a_as_in_ago),\r

833

('3:',e_as_in_herd),\r

('aI',eye),\r

('A e',var1_eye),\r

('b',b),\r

('t S',ch),\r

('d',d),\r

('D',th_as_in_them),\r

840

('e',e_as_in_them),\r

841

(ar_as_in_year,'3:',False),\r

842

('e @',a_as_in_air),\r

843

('e r',var1_a_as_in_air),\r

844

('e :',var2_a_as_in_air),\r

845

(var3_a_as_in_air,'e :',False),\r

846

('eI',a_as_in_ate),\r

847

('{I',var1_a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('I',i_as_in_it),\r

('1',var1_i_as_in_it),\r

('I@',ear),\r

('I r',var1_ear),\r

('i',e_as_in_eat),\r

('i:',var1_e_as_in_eat),\r

857

('dZ',j_as_in_jump),\r

858

('k',k),\r

859

('x',opt_scottish_loch),\r

('l',l),\r

('m',m),\r

('n',n),\r

('N',ng),\r

('@U',o_as_in_go),\r

('o U',var2_o_as_in_go),\r

866

('@ }',var1_u_as_in_but),\r

867

('OI',oy_as_in_toy),\r

868

('o I',var1_oy_as_in_toy),\r

('p',p),\r

('r',r),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('T',th_as_in_think),\r

875

('U@',oor_as_in_poor),\r

876

('U r',var1_oor_as_in_poor),\r

877

('U',opt_u_as_in_pull),\r

878

('u:',oo_as_in_food),\r

879

('O:',close_to_or),\r

880

(var1_close_to_or,'O',False),\r

('v',v),\r

('w',w),\r

('j',y),\r

('z',z),\r

('Z',ge_of_blige_etc),\r

886

lex_filename="acapela.txt",\r

887

lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: part-of-speech (as above)\r

888

lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]],\r

889

inline_format = "\\Prn=%s\\",\r

890

safe_to_drop_characters=True, # TODO: really?\r

),\r

\r

"cmu" : makeDic(\r

'format of the US-English Carnegie Mellon University Pronouncing Dictionary, contributed by Jan Weiss', # http://www.speech.cs.cmu.edu/cgi-bin/cmudict\r

895

('0',syllable_separator),\r

896

('1',primary_stress),\r

897

('2',secondary_stress),\r

898

('AA',a_as_in_ah),\r

899

(var1_a_as_in_ah,'2',False),\r

900

(ipa_colon,'1',False),\r

901

('AE',a_as_in_apple),\r

902

('AH',u_as_in_but),\r

903

(o_as_in_orange,'AA',False),\r

904

('AW',o_as_in_now),\r

905

(a_as_in_ago,'AH',False), # seems they don't use AX as festival-cmu does\r

906

('ER',e_as_in_herd), # TODO: check this one\r

('AY',eye),\r

('B',b),\r

('CH',ch),\r

('D',d),\r

('DH',th_as_in_them),\r

912

('EH',e_as_in_them),\r

913

(ar_as_in_year,'ER',False),\r

914

(a_as_in_air,'ER',False),\r

915

('EY',a_as_in_ate),\r

('F',f),\r

('G',g),\r

('HH',h),\r

('IH',i_as_in_it),\r

('EY AH',ear),\r

('IY',e_as_in_eat),\r

922

('JH',j_as_in_jump),\r

('K',k),\r

('L',l),\r

('M',m),\r

('N',n),\r

('NG',ng),\r

('OW',o_as_in_go),\r

('OY',oy_as_in_toy),\r

('P',p),\r

('R',r),\r

('S',s),\r

('SH',sh),\r

('T',t),\r

('TH',th_as_in_think),\r

936

('UH',oor_as_in_poor),\r

937

('UW',oo_as_in_food),\r

938

('AO',close_to_or),\r

('V',v),\r

('W',w),\r

('Y',y),\r

('Z',z),\r

('ZH',ge_of_blige_etc),\r

944

# lex_filename not set (does CMU have a lex file?)\r

945

safe_to_drop_characters=True, # TODO: really?\r

946

),\r

947

\r

948

# BEGIN PRE-32bit ERA SYNTHS (TODO: add an attribute to JS-hide them by default in HTML? what about the SpeakJet which probably isn't a 32-bit chip but is post 32-bit era? and then what about the 'approximation' formats - kana etc - would they need hiding by default also? maybe best to just leave it)\r

949

"apollo" : makeDic(\r

950

'Dolphin Apollo 2 serial-port and parallel-port hardware synthesizers (in case anybody still uses those)',\r

951

(syllable_separator,'',False), # I don't think the Apollo had anything to mark stress; TODO: control the pitch instead like bbcmicro ?\r

952

('_QQ',syllable_separator,False), # a slight pause\r

953

('_AA',a_as_in_apple),\r

954

('_AI',a_as_in_ate),\r

955

('_AR',a_as_in_ah),\r

956

('_AW',close_to_or),\r

957

('_A',a_as_in_ago),\r

('_B',b),\r

('_CH',ch),\r

('_D',d),\r

('_DH',th_as_in_them),\r

962

('_EE',e_as_in_eat),\r

963

('_EI',a_as_in_air),\r

964

('_ER',e_as_in_herd),\r

965

('_E',e_as_in_them),\r

('_F',f),\r

('_G',g),\r

('_H',h),\r

('_IA',ear),\r

('_IE',eye),\r

('_I',i_as_in_it),\r

('_J',j_as_in_jump),\r

973

('_K',k),\r

974

('_KK',k,False), # sCHool\r

('_L',l),\r

('_M',m),\r

('_NG',ng),\r

('_N',n),\r

('_OA',o_as_in_go),\r

980

('_OO',opt_u_as_in_pull),\r

981

('_OR',var3_close_to_or),\r

982

('_OW',o_as_in_now),\r

983

('_OY',oy_as_in_toy),\r

984

('_O',o_as_in_orange),\r

985

('_P',p),\r

986

('_PP',p,False), # sPeech (a stronger P ?)\r

987

# _Q = k w - done by cleanup_regexps below\r

('_R',r),\r

('_SH',sh),\r

('_S',s),\r

('_TH',th_as_in_think),\r

992

('_T',t), ('_TT',t,False),\r

993

('_UU',oo_as_in_food),\r

994

('_U',u_as_in_but),\r

995

('_V',v),\r

996

('_W',w),\r

997

# _X = k s - done by cleanup_regexps below\r

998

('_Y',y),\r

999

('_ZH',ge_of_blige_etc),\r

1000

('_Z',z),\r

1001

# lex_filename not set (the hardware doesn't have one; HAL has an "exceptions dictionary" but I don't know much about it)\r

1002

approximate_missing=True,\r

1003

safe_to_drop_characters=True, # TODO: really?\r

1004

word_separator=" ",phoneme_separator="",\r

1005

cleanup_regexps=[('_K_W','_Q'),('_K_S','_X')],\r

1006

cvtOut_regexps=[('_Q','_K_W'),('_X','_K_S')],\r

1007

),\r

1008

"dectalk" : makeDic(\r

1009

'DECtalk hardware synthesizers (American English)', # (1984-ish serial port; later ISA cards)\r

1010

(syllable_separator,'',False),\r

1011

("'",primary_stress),\r

1012

('aa',o_as_in_orange),\r

1013

('ae',a_as_in_apple),\r

1014

('ah',u_as_in_but),\r

1015

('ao',close_to_or), # bought\r

1016

('aw',o_as_in_now),\r

1017

('ax',a_as_in_ago),\r

('ay',eye),\r

('b',b),\r

('ch',ch),\r

('d',d), ('dx',d,False),\r

1022

('dh',th_as_in_them),\r

1023

('eh',e_as_in_them),\r

1024

('el',l,False), # -le of bottle, allophone ?\r

1025

# TODO: en: -on of button (2 phonemes?)\r

1026

('ey',a_as_in_ate),\r

('f',f),\r

('g',g),\r

('hx',h),\r

('ih',i_as_in_it), ('ix',i_as_in_it,False),\r

1031

('iy',e_as_in_eat), ('q',e_as_in_eat,False),\r

1032

('jh',j_as_in_jump),\r

1033

('k',k),\r

1034

('l',l), ('lx',l,False),\r

('m',m),\r

('n',n),\r

('nx',ng),\r

('ow',o_as_in_go),\r

('oy',oy_as_in_toy),\r

1040

('p',p),\r

1041

('r',r), ('rx',r,False),\r

1042

('rr',e_as_in_herd),\r

1043

('s',s),\r

1044

('sh',sh),\r

1045

('t',t), ('tx',t,False),\r

1046

('th',th_as_in_think),\r

1047

('uh',opt_u_as_in_pull),\r

1048

('uw',oo_as_in_food),\r

('v',v),\r

('w',w),\r

('yx',y),\r

('z',z),\r

('zh',ge_of_blige_etc),\r

1054

('ihr',ear), # DECtalk makes this from ih + r\r

1055

approximate_missing=True,\r

1056

cleanup_regexps=[('yxuw','yu')], # TODO: other allophones ("x',False" stuff above)?\r

1057

cvtOut_regexps=[('yu','yxuw')],\r

1058

# lex_filename not set (depends on which model etc)\r

1059

stress_comes_before_vowel=True,\r

1060

safe_to_drop_characters=True, # TODO: really?\r

1061

word_separator=" ",phoneme_separator="",\r

1062

inline_header="[:phoneme on]\n",\r

1063

inline_format="[%s]",\r

1064

),\r

1065

"doubletalk" : makeDic(\r

1066

'DoubleTalk PC/LT serial-port hardware synthesizers (American English; assumes DOS driver by default, otherwise set DTALK_COMMAND_CODE to your current command-code binary value, e.g. export DTALK_COMMAND_CODE=1)', # (1 is the synth's default; the DOS driver lets you put * instead)\r

1067

(syllable_separator,'',False),\r

1068

("/",primary_stress), # TODO: check it doesn't need a balancing \ afterwards (docs do say it's a "temporary" change of pitch, but it's unclear how long a 'temporary')\r

1069

('M',m),('N',n),('NX',ng),('O',o_as_in_go),\r

1070

('OW',o_as_in_go,False), # allophone\r

1071

(o_as_in_orange,'O',False), # TODO: is this the best approximation we can do?\r

1072

('OY',oy_as_in_toy),('P',p),\r

1073

('R',r),('S',s),('SH',sh),('T',t),\r

1074

('TH',th_as_in_think),('V',v),('W',w),('Z',z),\r

1075

('ZH',ge_of_blige_etc),('K',k),('L',l),\r

1076

('PX',p,False), ('TX',t,False), # aspirated allophones\r

1077

('WH',w,False), ('KX',k,False), # ditto\r

1078

('YY',y),('Y',y,False),\r

1079

('UH',opt_u_as_in_pull),('UW',oo_as_in_food),\r

1080

('AA',a_as_in_ah),('AE',a_as_in_apple),\r

1081

('AH',u_as_in_but),('AO',close_to_or),\r

1082

('AW',o_as_in_now),('AX',a_as_in_ago),\r

1083

('AY',eye),('B',b),('CH',ch),('D',d),\r

1084

('DH',th_as_in_them),\r

1085

('DX',t,False), # an American "d"-like "t"\r

1086

('EH',e_as_in_them),('ER',e_as_in_herd),\r

1087

('EY',a_as_in_ate),('F',f),('G',g),('H',h),\r

1088

('IH',i_as_in_it),('IX',i_as_in_it,False),\r

1089

('IY',e_as_in_eat),('JH',j_as_in_jump),\r

1090

approximate_missing=True,\r

1091

stress_comes_before_vowel=True,\r

1092

inline_format=markup_doubleTalk_word,\r

1093

format_is_binary=ifset('DTALK_COMMAND_CODE',True),\r

1094

# DoubleTalk does have a loadable "exceptions dictionary" but usually relies on a DOS tool to write it; I don't have the documentation about it (and don't know how much RAM is available for it - it's taken from the input buffer)\r

1095

),\r

1096

"keynote" : makeDic(\r

1097

'Phoneme-read and lexicon-add codes for Keynote Gold hardware synthesizers (American English)', # ISA, PCMCIA, serial, etc; non-serial models give you an INT 2Fh param to get the address of an API function to call; not sure which software can send these codes directly to it)\r

1098

(syllable_separator,'',False),\r

1099

(primary_stress,"'"),(secondary_stress,'"'),\r

1100

('w',w),('y',y),('h',h),('m',m),('n',n),('ng',ng),\r

1101

('l',l),('r',r),('f',f),('v',v),('s',s),('z',z),\r

1102

('th',th_as_in_think),('dh',th_as_in_them),('k',k),\r

1103

('ch',ch),('zh',ge_of_blige_etc),('sh',sh),('g',g),\r

1104

('jh',j_as_in_jump),('b',b),('p',p),('d',d),('t',t),\r

1105

('i',e_as_in_eat),('I',i_as_in_it),\r

1106

('e',a_as_in_ate),('E',e_as_in_them),\r

1107

('ae',a_as_in_apple),('u',oo_as_in_food),\r

1108

('U',opt_u_as_in_pull),('o',o_as_in_go),\r

1109

('O',close_to_or),('a',o_as_in_orange),\r

1110

('^',u_as_in_but),('R',e_as_in_herd),\r

1111

('ay',eye),('Oy',oy_as_in_toy),('aw',o_as_in_now),\r

1112

('=',a_as_in_ago),\r

1113

approximate_missing=True,\r

1114

inline_format="[p]%s[t]",\r

1115

lex_filename="keynote.dat", # you have to somehow get this directly dumped to the card, see comment above\r

1116

lex_entry_format="[x]%s %s", lex_footer="[t]\n",\r

1117

stress_comes_before_vowel=False, # even though it's "'"\r

1118

),\r

1119

"audapter" : makeVariantDic(\r

1120

"Audapter Speech System, an old hardware serial/parallel-port synthesizer (American English)", # 1989 I think. The phonemes themselves are the same as the Keynote above, but there's an extra binary byte in the commands and the lex format is stricter. I haven't checked but my guess is Audapter came before Keynote.\r

1121

inline_format='\x05[p] %s\x05[t]',\r

1122

format_is_binary=True,\r

1123

lex_filename="audapter.dat",\r

1124

lex_entry_format="\x05[x]%s %s\x05[t]\n", lex_footer="",\r

1125

),\r

1126

"bbcmicro" : makeDic(\r

1127

"BBC Micro Speech program from 1985 (see comments in lexconvert.py for more details)",\r

1128

# Speech was written by David J. Hoskins and published by Superior Software. It took 7.5k of RAM including 3.1k of samples (49 phonemes + 1 for fricatives at 64 bytes each, 4-bit ~5.5kHz), 2.2k of lexicon, and 2.2k of machine code; sounds "retro" by modern standards but quite impressive for the BBC Micro in 1985. Samples are played by amplitude-modulating the BBC's tone generator.\r

1129

# If you use an emulator like BeebEm, you'll need diskimg/Speech.ssd. This can be made from your original Speech disc, or you might be able to find one but beware of copyright! Same goes with the ROM images included in BeebEm (you might want to delete ones you didn't have). There has been considerable discussion over whether UK copyright law does or should allow "format-shifting" your own legally-purchased media, and I don't fully understand all the discussion so I don't want to give advice on it here. The issue is "format-shifting" your legally-purchased BBC Micro ROM code and Speech disc to emulator images; IF this is all right then I suspect downloading someone else's copy is arguably allowed as long as you bought it legally "back in the day", but I'm not a solicitor so I don't know.\r

1130

# (Incidentally, yes I was the Silas Brown referred to in Beebug 11.1 p.59, 11.9 p.50/11.10 p.47 and 12.10 p.24, and, no, the question in the final issue wasn't quite how I put it, but all taken in good humour.)\r

1131

# lexconvert's --phones bbcmicro option creates *SPEAK commands which you can type into the BBC Micro or paste into an emulator, either at the BASIC prompt, or in a listing with line numbers provided by AUTO. You have to load the Speech program first of course.\r

1132

# To script this on BeebEm, first turn off the Speech disc's boot option (by turning off File / Disc options / Write protect and entering "*OPT 4,0"; use "*OPT 4,3" if you want it back later; if you prefer to edit the disk image outside of the emulator then change byte 0x106 from 0x33 to 0x03), and then you can do (e.g. on a Mac) open /usr/local/BeebEm3/diskimg/Speech.ssd && sleep 1 && (echo '*SPEECH';python lexconvert.py --phones bbcmicro "Greetings from 19 85") | pbcopy && osascript -e 'tell application "System Events" to keystroke "v" using command down'\r

1133

# or if you know it's already loaded: echo "Here is some text" | python lexconvert.py --phones bbcmicro | pbcopy && osascript -e 'tell application "BeebEm3" to activate' && osascript -e 'tell application "System Events" to keystroke "v" using command down'\r

1134

# (unfortunately there doesn't seem to be a way of doing it without giving the emulator window focus)\r

1135

# If you want to emulate a Master, you might need a *DISK before the *SPEECH (to take it out of ADFS mode).\r

1136

# You can also put Speech into ROM, but this can cause problems: see comments on SP8000 later.\r

1137

(syllable_separator,'',False),\r

1138

('4',primary_stress),\r

1139

('5',secondary_stress), # (these are pitch numbers on the BBC; normal pitch is 6, and lower numbers are higher pitches, so try 5=secondary and 4=primary; 3 sounds less calm)\r

1140

('AA',a_as_in_ah),\r

1141

('AE',a_as_in_apple),\r

1142

('AH',u_as_in_but),\r

1143

('O',o_as_in_orange),\r

1144

('AW',o_as_in_now),\r

1145

(a_as_in_ago,'AH',False),\r

1146

('ER',e_as_in_herd),\r

('IY',eye),\r

('B',b),\r

('CH',ch),\r

('D',d),\r

('DH',th_as_in_them),\r

1152

('EH',e_as_in_them),\r

1153

(ar_as_in_year,'ER',False),\r

1154

('AI',a_as_in_air),\r

1155

('AY',a_as_in_ate),\r

('F',f),\r

('G',g),\r

('/H',h),\r

('IH',i_as_in_it),\r

('IX',var2_i_as_in_it), # (IX sounds to me like a slightly shorter version of IH)\r

1161

('IXAH',ear),\r

1162

('EER',var2_ear), # e.g. 'hear', 'near' - near enough\r

1163

('EE',e_as_in_eat),\r

1164

('J',j_as_in_jump),\r

1165

('K',k),\r

1166

('C',k,False), # for CT as in "fact", read out as K+T\r

('L',l),\r

('M',m),\r

('N',n),\r

('NX',ng),\r

('OW',o_as_in_go),\r

('OL',opt_ol_as_in_gold), # (if dest format doesn't have this, it'll get o_as_in_orange from the O, then the l)\r

1173

('OY',oy_as_in_toy),\r

('P',p),\r

('R',r),\r

('S',s),\r

('SH',sh),\r

('T',t),\r

('TH',th_as_in_think),\r

1180

('AOR',oor_as_in_poor),\r

1181

('UH',oor_as_in_poor,False), # TODO: really? (espeak 'U' goes to opt_u_as_in_pull, and eSpeak also used U for the o in good, which sounds best with Speech's default UH4, hence the line below, but where did we get UH->oor_as_in_poor from? Low-priority though because how often do you convert OUT of bbcmicro format)\r

1182

(opt_u_as_in_pull,'UH',False),\r

1183

('/U',opt_u_as_in_pull,False),\r

1184

('/UL',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the /U, then l\r

1185

('UW',oo_as_in_food),\r

1186

('UX',oo_as_in_food,False),\r

1187

('AO',close_to_or),\r

('V',v),\r

('W',w),\r

('Y',y),\r

('Z',z),\r

('ZH',ge_of_blige_etc),\r

1193

lex_filename=ifset("MAKE_SPEECH_ROM","SPEECH.ROM","BBCLEX"),\r

1194

lex_entry_format=as_utf8("> %s_")+chr(128)+as_utf8("%s"), # (specifying 'whole word' for now; remove the space before or the _ after if you want)\r

1195

lex_read_function = lambda lexfile: [(w[0].lstrip().rstrip('_').lower(),w[1]) for w in filter(lambda x:len(x)==2,[w.split(chr(128)) for w in getBuf(lexfile).read().split('>')])], # TODO: this reads back the entries we generate, but is unlikely to work well with the wildcards in the default lexicon that would have been added if SPEECH_DISK was set (c.f. trying to read eSpeak's en_rules instead of en_extra)\r

1196

lex_word_case = "upper",\r

1197

lex_header = bbc_prepDefaultLex,\r

1198

lex_footer = bbc_appendDefaultLex, # + ">**"\r

1199

inline_format = markup_bbcMicro_word,\r

1200

word_separator=" ",phoneme_separator="",\r

1201

clause_separator=write_bbcmicro_phones, # special case\r

1202

safe_to_drop_characters=True, # TODO: really?\r

1203

cleanup_regexps=[\r

1204

('KT','CT'), # Speech instructions: "CT as in fact"\r

1205

('DYUW','DUX'), # "DUX as in duke"\r

1206

('AHR$','AH'), # usually sounds a bit better\r

1207

],\r

1208

cvtOut_regexps=[('DUX','DYUW')], # CT handled above\r

1209

),\r

1210

"bbcmicro-cc" : makeDic(\r

1211

"Computer Concepts Speech ROM which provided phonemes for the BBC Micro's TMS5220 \"speech chip\" add-on (less widely sold than the software-only product)", # (and harder to run on an emulator. It wasn't the only phoneme ROM, e.g. Easytalk Speech Utility ROM by Galaxy, reviewed in Beebug Jan/Feb 1985 (3.8) p.32, expanded on Acorn's original PHROM with commands like *SAY Y.U:N.I.V.ER.S but we don't know all the phonemes; there were also some allophone-based hardware boards)\r

1212

(syllable_separator,"",False),\r

1213

('*',primary_stress),('+',secondary_stress),\r

1214

('E',e_as_in_eat),('i',i_as_in_it),('e',e_as_in_them),\r

1215

('a',a_as_in_apple),('u',u_as_in_but),('AR',a_as_in_ah),\r

1216

('o',o_as_in_orange),('OR',close_to_or),('oo',opt_u_as_in_pull),\r

1217

('OO',oo_as_in_food),('ER',e_as_in_herd),('A',a_as_in_ate),\r

1218

('I',eye),('O',o_as_in_go),('OY',oy_as_in_toy),\r

1219

('AW',o_as_in_now),('EA',ear),('ea',a_as_in_air),\r

1220

('UR',oor_as_in_poor),('UH',a_as_in_ago),\r

1221

('P',p),('B',b),('T',t),\r

1222

('D',d),('K',k),('G',g),\r

1223

('CH',ch),('J',j_as_in_jump),('F',f),\r

1224

('V',v),('TH',th_as_in_think),('DH',th_as_in_them),\r

1225

('S',s),('Z',z),('SH',sh),\r

1226

('ZH',ge_of_blige_etc),('H',h),('M',m),\r

1227

('N',n),('NG',ng),('L',l),\r

1228

('R',r),('Y',y),('W',w),\r

1229

stress_comes_before_vowel=True,\r

1230

inline_header="*UTTER <1> ",\r

1231

clause_separator="\n*UTTER <1> ", # TODO: manual does not say what the maximum length is; longest parameter in examples is 80 bytes; should we use inline_format to make each WORD a separate command?\r

1232

cleanup_regexps=[('[*] ','*'),('[+] ','+')],\r

1233

safe_to_drop_characters=' ',\r

),\r

\r

"amiga" : makeDic(\r

'AmigaOS speech synthesizer (American English)', # shipped with the 1985 Amiga release; developed by SoftVoice Inc\r

1238

# All I had to go by for this was a screenshot on Marcos Miranda's "blog". I once saw this synth demonstrated but never tried it. My early background was the BBC Micro, not Amigas etc. But I know some people are keen on Amigas so I might as well include it.\r

1239

# (By the way I think David Hoskins had it harder than SoftVoice. Yes they were both in 1985, but the Amiga was a new 16-bit machine while the BBC was an older 8-bit one. See the "sam" format for an even older one though, although probably not written by one person.)\r

1240

(syllable_separator,'',False),\r

1241

('4',primary_stress),('3',secondary_stress),\r

1242

('/H',h),\r

1243

('EH',e_as_in_them),\r

('L',l),\r

('OW',o_as_in_go),\r

('AY',eye),\r

('AE',a_as_in_apple),\r

1248

('M',m),\r

1249

('DH',th_as_in_them),\r

1250

('IY',e_as_in_eat),\r

1251

('AH',a_as_in_ago),\r

('G',g),\r

('K',k),\r

('U',u_as_in_but),\r

('P',p),\r

('Y',y),\r

('UW',oo_as_in_food),\r

1258

('T',t),\r

1259

('ER',var1_a_as_in_ago),\r

('IH',i_as_in_it),\r

('S',s),\r

('Z',z),\r

('AW',o_as_in_now),\r

1264

('AA',a_as_in_ah),\r

1265

('R',r),\r

1266

('D',d),('F',f),('N',n),('NX',ng),('J',j_as_in_jump),\r

1267

('B',b),('V',v),('TH',th_as_in_think),\r

1268

('OH',close_to_or),('EY',a_as_in_ate),\r

1269

# The following consonants were not on the screenshot\r

1270

# (or at least I couldn't find them) so I'm guessing.\r

1271

# I think this should work given the way the other\r

1272

# consonants work in this table.\r

1273

('W',w),('CH',ch),('SH',sh),\r

1274

# The following vowels were not in the screenshot and\r

1275

# we just have to hope this guess is right - when\r

1276

# someone tries it on an Amiga and says it doesn't\r

1277

# work, maybe we can update this....\r

1278

('O',o_as_in_orange),('OY',oy_as_in_toy),\r

1279

# and these ones we can approximate to ones we already know (given that we're having to approximate British to an American voice anyway, it can't hurt TOO much more)\r

1280

(a_as_in_air,'EH',False),\r

1281

(e_as_in_herd,'ER',False),\r

1282

(ar_as_in_year,'ER',False),\r

1283

(ear,'IYAH',False), # or try IYER, or there might be a phoneme for it\r

1284

(ge_of_blige_etc,'J',False),\r

1285

(oor_as_in_poor,'OH',False),\r

1286

# lex_filename not set (I have no idea how the Amiga lexicon worked)\r

1287

safe_to_drop_characters=True, # TODO: really?\r

1288

word_separator=" ",phoneme_separator="",\r

1289

),\r

1290

"sam" : makeDic(\r

1291

'Software Automatic Mouth (1982 American English synth that ran on C64, Atari 400/800/etc and Apple II/etc)', # *might* be similar to Macintalk on the 1st Macintosh in 1984\r

1292

(syllable_separator,'',False),\r

1293

(primary_stress,'4'),\r

1294

(secondary_stress,'5'),\r

1295

('IY',e_as_in_eat),\r

1296

('IH',i_as_in_it),\r

1297

('EH',e_as_in_them),\r

1298

('AE',a_as_in_apple),\r

1299

('AA',o_as_in_orange),\r

1300

('AH',u_as_in_but),\r

1301

('AO',close_to_or),\r

1302

('OH',o_as_in_go),\r

1303

('UH',opt_u_as_in_pull),\r

1304

('UX',oo_as_in_food),\r

1305

('ER',e_as_in_herd),\r

1306

('AX',a_as_in_apple,False), # allophone?\r

1307

('IX',i_as_in_it,False), # allophone?\r

1308

('EY',a_as_in_ate),\r

1309

('AY',eye),('OY',oy_as_in_toy),\r

1310

('AW',o_as_in_now),('OW',o_as_in_go,False),\r

1311

('UW',oo_as_in_food,False), # allophone?\r

1312

('R',r),('L',l),('W',w),('WH',w,False),('Y',y),('M',m),\r

1313

('N',n),('NX',ng),('B',b),('D',d),('G',g),('Z',z),\r

1314

('J',j_as_in_jump),('ZH',ge_of_blige_etc),('V',v),\r

1315

('DH',th_as_in_them),('S',s),('SH',sh),('F',f),\r

1316

('TH',th_as_in_think),('P',p),('T',t),('K',k),\r

1317

('CH',ch),('/H',h),('Q',glottal_stop),\r

1318

approximate_missing=True,\r

1319

word_separator=" ",phoneme_separator="",\r

1320

# TODO: inline_format etc similar to bbcmicro?\r

1321

# In Atari BASIC, you set SAM$ to the phonemes and then\r

1322

# do A=USR(8192). I don't know about the C64 etc versions.\r

1323

# (max 255 phonemes per string; don't know max line len.)\r

1324

),\r

1325

\r

1326

"cheetah" : makeDic(\r

1327

'Allophone codes for the 1983 "Cheetah Sweet Talker" SP0256-based hardware add-on for ZX Spectrum and BBC Micro home computers. The conversion from phonemes to allophones might need tweaking.',\r

1328

(syllable_separator,'',False),\r

1329

("0",syllable_separator,False),\r

1330

("1",syllable_separator,False),\r

1331

("2",syllable_separator,False),\r

1332

("3",syllable_separator,False),\r

1333

("4",syllable_separator,False),\r

1334

("5",oy_as_in_toy),\r

1335

("6",eye),\r

1336

("7",e_as_in_them),\r

1337

("8",k,False),\r

1338

("9",p),\r

1339

("10",j_as_in_jump),\r

("11",n),\r

("12",i_as_in_it),\r

("13",t),\r

("14",r),\r

("15",u_as_in_but),\r

1345

("16",m),\r

1346

("17",t,False),\r

1347

("18",th_as_in_them),\r

1348

("19",e_as_in_eat),\r

1349

("20",a_as_in_ate),\r

1350

("21",d),\r

1351

("22",oo_as_in_food),\r

1352

("23",close_to_or),\r

1353

("24",o_as_in_orange),\r

1354

("25",y),\r

1355

("26",a_as_in_apple),\r

1356

("27",h),\r

1357

("28",b),\r

1358

("29",th_as_in_think),\r

1359

(opt_u_as_in_pull,"30",False),\r

1360

("30",opt_ul_as_in_pull),\r

1361

("31",oo_as_in_food,False),\r

1362

("32",o_as_in_now),\r

("33",d,False),\r

("34",g,False),\r

("35",v),\r

("36",g),\r

("37",sh),\r

("38",ge_of_blige_etc),\r

("39",r,False),\r

("40",f),\r

("41",k),\r

("42",k,False),\r

("43",z),\r

("44",ng),\r

("45",l),\r

("46",w),\r

("47",a_as_in_air),\r

1378

("49",y,False),\r

1379

("50",ch),\r

1380

("51",a_as_in_ago),\r

1381

("52",e_as_in_herd),\r

1382

(var1_a_as_in_ago,"52",False),\r

1383

("53",o_as_in_go),\r

1384

("54",th_as_in_them,False),\r

("55",s),\r

("56",n,False),\r

("57",h,False),\r

("58",var3_close_to_or),\r

1389

("59",a_as_in_ah),\r

1390

("60",ear), # or var2_ear\r

("61",g,False),\r

("62",l,False),\r

("63",b,False),\r

approximate_missing=True,\r

1395

phoneme_separator=',',safe_to_drop_characters=",",\r

1396

inline_header="DATA ",inline_footer=",0"),\r

1397

\r

1398

# END (?) PRE-32bit ERA SYNTHS (but see TODO above re SpeakJet, which is below)\r

1399

\r

1400

"speakjet" : makeDic(\r

1401

'Allophone codes for the American English "SpeakJet" speech synthesis chip (the conversion from phonemes to allophones might need tweaking). Set the SPEAKJET_SYM environment variable to use mnemonics, otherwise numbers are used (set SPEAKJET_BINARY for binary output).',\r

1402

# TODO: might want to do something similar for the older Votrax SC-02 chip, but would need to check how exactly its phoneme interface was exposed to software by the PC cards that used it (Heathkit HV-2000 etc; not sure if any are still in use though)\r

1403

(syllable_separator,'',False), # TODO: instead of having emphasis, the Speakjet has a 'faster' code for all NON-emphasized syllables\r

1404

(speakjet('IY',128),e_as_in_eat),\r

1405

(speakjet('IH',129),i_as_in_it),\r

1406

(speakjet('EY',130),a_as_in_ate),\r

1407

(speakjet('EH',131),e_as_in_them),\r

1408

(speakjet('AY',132),a_as_in_apple),\r

1409

(speakjet('AX',133),a_as_in_ago),\r

1410

(speakjet('UX',134),u_as_in_but),\r

1411

(speakjet('OH',135),o_as_in_orange),\r

1412

(speakjet('AW',136),a_as_in_ah),\r

1413

(speakjet('OW',137),o_as_in_go),\r

1414

(speakjet('UH',138),opt_u_as_in_pull),\r

1415

(speakjet('UW',139),oo_as_in_food),\r

1416

(speakjet('MM',140),m),\r

1417

(speakjet('NE',141),n,False),\r

1418

(speakjet('NO',142),n),\r

1419

(speakjet('NGE',143),ng,False),\r

1420

(speakjet('NGO',144),ng),\r

1421

(speakjet('LE',145),l,False),\r

1422

(speakjet('LO',146),l),\r

1423

(speakjet('WW',147),w),\r

1424

(speakjet('RR',148),r),\r

1425

(speakjet('IYRR',149),ear),\r

1426

(speakjet('EYRR',150),a_as_in_air),\r

1427

(speakjet('AXRR',151),e_as_in_herd),\r

1428

(speakjet('AWRR',152),a_as_in_ah,False),\r

1429

(speakjet('OWRR',153),close_to_or),\r

1430

(speakjet('EYIY',154),a_as_in_ate,False),\r

1431

(speakjet('OHIY',155),eye),\r

1432

(speakjet('OWIY',156),oy_as_in_toy),\r

1433

(speakjet('OHIH',157),eye,False),\r

1434

(speakjet('IYEH',158),y),\r

1435

(speakjet('EHLL',159),l,False),\r

1436

(speakjet('IYUW',160),oo_as_in_food,False),\r

1437

(speakjet('AXUW',161),o_as_in_now),\r

1438

(speakjet('IHUW',162),oo_as_in_food,False),\r

1439

# TODO: 163 AYWW = o_as_in_now a_as_in_ago ? handle in cleanup_regexps + cvtOut_regexps ?\r

1440

(speakjet('OWWW',164),o_as_in_go,False),\r

1441

(speakjet('JH',165),j_as_in_jump),\r

1442

(speakjet('VV',166),v),\r

1443

(speakjet('ZZ',167),z),\r

1444

(speakjet('ZH',168),ge_of_blige_etc),\r

1445

(speakjet('DH',169),th_as_in_them),\r

1446

# TODO: get cleanup_regexps to clean up some of these according to what's coming next etc:\r

1447

(speakjet('BE',170),b,False),\r

1448

(speakjet('BO',171),b),\r

1449

(speakjet('EB',172),b,False),\r

1450

(speakjet('OB',173),b,False),\r

1451

(speakjet('DE',174),d,False),\r

1452

(speakjet('DO',175),d),\r

1453

(speakjet('ED',176),d,False),\r

1454

(speakjet('OD',177),d,False),\r

1455

(speakjet('GE',178),g,False),\r

1456

(speakjet('GO',179),g),\r

1457

(speakjet('EG',180),g,False),\r

1458

(speakjet('OG',181),g,False),\r

1459

(speakjet('CH',182),ch),\r

1460

(speakjet('HE',183),h,False),\r

1461

(speakjet('HO',184),h),\r

1462

(speakjet('WH',185),w,False),\r

1463

(speakjet('FF',186),f),\r

1464

(speakjet('SE',187),s,False),\r

1465

(speakjet('SO',188),s),\r

1466

(speakjet('SH',189),sh),\r

1467

(speakjet('TH',190),th_as_in_think),\r

1468

(speakjet('TT',191),t),\r

1469

(speakjet('TU',192),t,False),\r

1470

# TODO: 193 TS in cleanup_regexps and cvtOut_regexps\r

1471

(speakjet('KE',194),k,False),\r

1472

(speakjet('KO',195),k),\r

1473

(speakjet('EK',196),k,False),\r

1474

(speakjet('OK',197),k,False),\r

1475

(speakjet('PE',198),p,False),\r

1476

(speakjet('PO',199),p),\r

1477

# lex_filename not set (I think the front-end software might have one, but don't know if it's accessible; chip itself just takes phonemes)\r

1478

approximate_missing=True,\r

1479

word_separator=ifset('SPEAKJET_BINARY',""," "),\r

1480

phoneme_separator=ifset('SPEAKJET_BINARY',""," "),\r

1481

clause_separator=ifset('SPEAKJET_BINARY',"","\n"), # TODO: is there a pause code?\r

1482

output_is_binary=ifset('SPEAKJET_BINARY',True),\r

1483

safe_to_drop_characters=True, # TODO: really?\r

1484

),\r

1485

\r

1486

"rsynth" : makeDic(\r

1487

'rsynth text-to-speech C library (American English)', # TODO: test\r

1488

(syllable_separator,'',False), # TODO: emphasis?\r

1489

("i:",e_as_in_eat),\r

1490

("I",i_as_in_it),\r

1491

("eI",a_as_in_ate),\r

1492

("E",e_as_in_them),\r

1493

("{",a_as_in_apple),\r

1494

("V",u_as_in_but),\r

1495

("Q",o_as_in_orange),\r

1496

("A:",a_as_in_ah),\r

1497

("oU",o_as_in_go),\r

1498

("U",opt_u_as_in_pull),\r

1499

("u:",oo_as_in_food),\r

("m",m),\r

("n",n),\r

("N",ng),\r

("l",l),\r

("w",w),\r

("r",r),\r

("I@",ear),\r

("e@",a_as_in_air),\r

1508

("3:",e_as_in_herd),\r

1509

("Qr",close_to_or),\r

1510

("OI",oy_as_in_toy),\r

1511

("aI",eye),\r

1512

("j",y),\r

1513

("U@",oo_as_in_food,False),\r

1514

("aU",o_as_in_now),\r

1515

("@U",o_as_in_go,False),\r

1516

("dZ",j_as_in_jump),\r

1517

("v",v),\r

1518

("z",z),\r

1519

("Z",ge_of_blige_etc),\r

1520

("D",th_as_in_them),\r

("b",b),\r

("d",d),\r

("g",g),\r

("tS",ch),\r

("h",h),\r

("f",f),\r

("s",s),\r

("S",sh),\r

("T",th_as_in_think),\r

("t",t),\r

("k",k),\r

("p",p),\r

approximate_missing=True,\r

1534

# lex_filename not set (TODO: check what sort of lexicon is used by rsynth's "say" front-end)\r

1535

safe_to_drop_characters=True, # TODO: really?\r

1536

word_separator=" ",phoneme_separator="",\r

1537

),\r

1538

\r

1539

"unicode-ipa" : makeDic(\r

1540

"IPA symbols in Unicode, as used by an increasing number of dictionary programs, websites etc",\r

1541

('.',syllable_separator,False),\r

1542

(syllable_separator,'',False),\r

1543

(u'\u02c8',primary_stress),\r

1544

(u'\u02cc',secondary_stress),\r

1545

# NB the above two are "modifier", not "combining",\r

1546

# Unicode characters. There IS a difference. If\r

1547

# your software displays them as overprinting the\r

1548

# surrounding letters, you have a bug.\r

1549

# (E.g. WeChat v1.2.2.1 on Mac OS 10.7)\r

1550

('#',text_sharp),\r

1551

('_',text_underline),\r

1552

('?',text_question),\r

1553

('!',text_exclamation),\r

1554

(',',text_comma),\r

1555

(u'\u0251',a_as_in_ah),\r

1556

(u'\u02d0',ipa_colon),\r

1557

(u'\u0251\u02d0',var3_a_as_in_ah),\r

1558

(u'\u0251\u0279',var4_a_as_in_ah),\r

1559

(u'a\u02d0',var5_a_as_in_ah),\r

1560

(u'\xe6',a_as_in_apple),\r

1561

('a',a_as_in_apple,False),\r

1562

(u'\u028c',u_as_in_but),\r

1563

('\u1d27',u_as_in_but,False), # 28c sometimes mistakenly written as 1d27\r

1564

(u'\u0252',o_as_in_orange),\r

1565

(var1_o_as_in_orange,u'\u0251',False),\r

1566

(u'\u0254',var2_o_as_in_orange),\r

1567

(u'a\u028a',o_as_in_now),\r

1568

(u'\xe6\u0254',var1_o_as_in_now),\r

1569

(u'\u0259',a_as_in_ago),\r

1570

(u'\u0259\u02d0',e_as_in_herd),\r

1571

(u'\u025a',var1_a_as_in_ago),\r

1572

(u'a\u026a',eye), (u'\u028c\u026a',eye,False),\r

1573

(u'\u0251e',var1_eye),\r

1574

('b',b),\r

1575

(u't\u0283',ch),\r

1576

(u'\u02a7',ch,False),\r

1577

('d',d),\r

1578

(u'\xf0',th_as_in_them),\r

1579

(u'\u025b',e_as_in_them),\r

1580

('e',var1_e_as_in_them),\r

1581

(u'\u025d',ar_as_in_year),\r

1582

(u'\u025c\u02d0',ar_as_in_year,False),\r

1583

(u'\u025b\u0259',a_as_in_air),\r

1584

(u'\u025b\u0279',var1_a_as_in_air),\r

1585

(u'e\u02d0',var2_a_as_in_air),\r

1586

(u'\u025b\u02d0',var3_a_as_in_air),\r

1587

(u'e\u0259',var4_a_as_in_air),\r

1588

(u'e\u026a',a_as_in_ate),\r

1589

(u'\xe6\u026a',var1_a_as_in_ate),\r

1590

('f',f),\r

1591

(u'\u0261',g), ('g',g,False),\r

1592

('h',h),\r

1593

(u'\u026a',i_as_in_it),\r

1594

(u'\u0268',var1_i_as_in_it),\r

1595

(u'\u026a\u0259',ear),\r

1596

(u'\u026a\u0279',var1_ear),\r

1597

(u'\u026a\u0279\u0259',var2_ear), # ?\r

1598

('i',e_as_in_eat),\r

1599

(u'i\u02d0',var1_e_as_in_eat),\r

1600

(u'd\u0292',j_as_in_jump),\r

1601

(u'\u02a4',j_as_in_jump,False),\r

1602

('k',k),\r

1603

('x',opt_scottish_loch),\r

1604

('l',l),\r

1605

(u'd\u026b',var1_l),\r

('m',m),\r

('n',n),\r

(u'\u014b',ng),\r

(u'\u0259\u028a',o_as_in_go),\r

1610

('o',var1_o_as_in_go),\r

1611

(u'o\u028a',var2_o_as_in_go),\r

1612

(u'\u0259\u0289',var1_u_as_in_but),\r

1613

(u'\u0254\u026a',oy_as_in_toy),\r

1614

(u'o\u026a',var1_oy_as_in_toy),\r

1615

('p',p),\r

1616

(u'\u0279',r), ('r',r,False),\r

1617

(var1_r,'r',False),\r

('s',s),\r

(u'\u0283',sh),\r

('t',t),\r

(u'\u027e',var1_t),\r

1622

(u'\u03b8',th_as_in_think),\r

1623

(u'\u028a\u0259',oor_as_in_poor),\r

1624

(u'\u028a\u0279',var1_oor_as_in_poor),\r

1625

(u'\u028a',opt_u_as_in_pull),\r

1626

(u'\u0289\u02d0',oo_as_in_food),\r

1627

(u'u\u02d0',var1_oo_as_in_food),\r

1628

('u',var2_oo_as_in_food),\r

1629

(u'\u0254\u02d0',close_to_or),\r

1630

(var1_close_to_or,u'\u0254',False),\r

1631

(u'o\u02d0',var2_close_to_or),\r

1632

('v',v),\r

1633

('w',w),\r

1634

(u'\u028d',var1_w),\r

1635

('j',y),\r

1636

('z',z),\r

1637

(u'\u0292',ge_of_blige_etc),\r

1638

(u'\u0294',glottal_stop),\r

1639

lex_filename="words-ipa.html", # write-only for now\r

1640

lex_type = "HTML",\r

1641

lex_header = '<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body><table>',\r

1642

lex_entry_format="<tr><td>%s</td><td>%s</td></tr>\n",\r

1643

lex_footer = "</table></body></html>\n",\r

1644

word_separator=" ",phoneme_separator="",\r

1645

stress_comes_before_vowel=True,\r

1646

safe_to_drop_characters=True, # TODO: really? (at least '-' should be safe to drop)\r

1647

cvtOut_func=unicode_preprocess,\r

1648

),\r

1649

\r

1650

"unicode-ipa-syls" : makeVariantDic(\r

1651

"Like unicode-ipa but with syllable separators preserved",\r

1652

(syllable_separator,'.'),\r

1653

cleanup_regexps=[(r"\.+",".")], # multiple . to one .\r

1654

noInherit=True),\r

1655

\r

1656

"yinghan" : makeVariantDic(\r

1657

"As unicode-ipa but, when converting a user lexicon, generates Python code that reads Wenlin Yinghan dictionary entries and adds IPA bands to matching words",\r

1658

lex_filename="yinghan-ipa.py", # write-only for now\r

1659

lex_type = "Python script",\r

1660

lex_header = r"""#!/usr/bin/env python\r

1661

# -*- coding: utf-8 -*-\r

1662

\r

1663

# Works in both Python 2 and Python 3\r

1664

\r

1665

import sys; d={""",\r

1666

lex_entry_format='u"%s":u"%s",\n',\r

1667

lex_footer = r"""}\r

1668

import re\r

1669

try: i,o=sys.stdin.buffer,sys.stdout.buffer # Python 3\r

1670

except AttributeError: i,o=sys.stdin,sys.stdout # Python 2\r

1671

for k in list(d.keys()): d[k.lower().encode('utf-8')]=d[k]\r

nextIsHead=False\r

for l in i:\r

o.write(l)\r

if nextIsHead and l.strip():\r

1676

w=l.split()\r

1677

if w[0]==u'ehw'.encode('utf-8'): l=u' '.encode('utf-8').join(w[1:])\r

1678

k = re.sub(u'\([^)]*\)$'.encode('utf-8'),u''.encode('utf-8'),l.strip()).strip().lower() # (allow parenthesised explanation after headword when matching)\r

1679

if k in d: o.write(u'ipa '.encode('utf-8')+d[k].encode('utf-8')+u'\n'.encode('utf-8'))\r

1680

if l.startswith(u'*** '.encode('utf-8')): nextIsHead=True\r

""",\r

noInherit=True\r

),\r

\r

"unicode-rough" : makeVariantDic(\r

1686

"A non-standard notation that's reminiscent of unicode-ipa but changed so that more of the characters show in old browsers with incomplete fonts",\r

1687

("'",primary_stress),\r

1688

(',',secondary_stress),\r

1689

('ar-',a_as_in_ah),\r

1690

(':',ipa_colon),\r

1691

(var3_a_as_in_ah,'ar-',False),\r

1692

(var4_a_as_in_ah,'ar-',False),\r

1693

('uh',u_as_in_but),\r

1694

(u'\u0259:',e_as_in_herd),\r

1695

('ai',eye),\r

1696

('ch',ch),\r

1697

('e',e_as_in_them),\r

1698

('3:',ar_as_in_year),\r

1699

(a_as_in_air,'e:',False),\r

1700

(var1_a_as_in_air,'e:',False),\r

1701

(var2_a_as_in_air,'e:',False),\r

1702

(var3_a_as_in_air,'e:',False),\r

1703

(var4_a_as_in_air,'e:',False),\r

1704

(u'ei',a_as_in_ate),\r

1705

(u'\xe6i',var1_a_as_in_ate),\r

1706

('g',g),\r

1707

('i',i_as_in_it), (var1_i_as_in_it,'i',False),\r

1708

('eeuh-',ear), (var2_ear,'eeuh-',False),\r

1709

('ee',e_as_in_eat), (var1_e_as_in_eat,'ee',False),\r

1710

('j',j_as_in_jump),\r

1711

('ng',ng),\r

1712

('o',o_as_in_go),\r

1713

(var2_o_as_in_go,'o',False), # override unicode-ipa\r

1714

(var1_u_as_in_but,'o',False), # ditto (?? '+'?)\r

1715

('oy',oy_as_in_toy), (var1_oy_as_in_toy,'oy',False),\r

1716

('r',r),\r

1717

('sh',sh),\r

1718

(var1_t,'t',False),\r

1719

('th',th_as_in_think),\r

1720

('or',oor_as_in_poor),\r

1721

(var1_oor_as_in_poor,'or',False),\r

1722

('u',opt_u_as_in_pull), ('oo',oo_as_in_food),\r

1723

(var1_oo_as_in_food,'oo',False),\r

1724

(var2_oo_as_in_food,'oo',False),\r

1725

(close_to_or,'or',False),\r

1726

(var1_close_to_or,'or',False),\r

1727

(var2_close_to_or,'or',False),\r

1728

(var1_w,'w',False),\r

1729

('y',y),\r

1730

('3',ge_of_blige_etc),\r

1731

cleanup_regexps=[('-$','')],\r

cvtOut_func=None,\r

),\r

\r

"braille-ipa" : makeDic(\r

1736

"IPA symbols in Braille (2008 BANA standard). By default Braille ASCII is output; if you prefer to see the Braille dots via Unicode, set the BRAILLE_UNICODE environment variable.", # BANA = Braille Authority of North America. TODO: check if the UK accepted this standard.\r

1737

# TODO: add Unicode IPA signs that aren't used in English IPA, so we can do a general IPA conversion\r

1738

('_B',primary_stress),\r

1739

('_2',secondary_stress),\r

1740

('*',a_as_in_ah),\r

1741

('3',ipa_colon),\r

1742

('*3',var3_a_as_in_ah),\r

1743

('*#',var4_a_as_in_ah),\r

1744

('A3',var5_a_as_in_ah),\r

1745

('%',a_as_in_apple),\r

1746

('A',a_as_in_apple,False),\r

1747

('+',u_as_in_but),\r

1748

('4*',o_as_in_orange),\r

1749

(var1_o_as_in_orange,'*',False),\r

1750

('<',var2_o_as_in_orange),\r

1751

('A(',o_as_in_now),\r

1752

('%<',var1_o_as_in_now),\r

1753

('5',a_as_in_ago),\r

1754

('53',e_as_in_herd),\r

1755

('5"R.',var1_a_as_in_ago),\r

('A/',eye),\r

('*E',var1_eye),\r

('B',b),\r

('T:',ch),\r

('T":.',ch,False),\r

('D',d),\r

(']',th_as_in_them),\r

1763

('>',e_as_in_them),\r

1764

('E',var1_e_as_in_them),\r

1765

('4>3',ar_as_in_year), # (from \u025c\u02d0; TODO: check what happens to \u025d)\r

1766

('>5',a_as_in_air),\r

1767

('>#',var1_a_as_in_air),\r

1768

('E3',var2_a_as_in_air),\r

1769

('>3',var3_a_as_in_air),\r

1770

('E5',var4_a_as_in_air),\r

1771

('E/',a_as_in_ate),\r

1772

('%/',var1_a_as_in_ate),\r

('F',f),\r

('G',g),\r

('H',h),\r

('/',i_as_in_it),\r

('0I',var1_i_as_in_it),\r

1778

('/5',ear),\r

1779

('/#',var1_ear),\r

1780

('/#5',var2_ear), # ?\r

1781

('I',e_as_in_eat),\r

1782

('I3',var1_e_as_in_eat),\r

1783

('D!',j_as_in_jump),\r

1784

('K',k),\r

1785

('X',opt_scottish_loch),\r

('L',l),\r

('D6L',var1_l),\r

('M',m),\r

('N',n),\r

('$',ng),\r

('5(',o_as_in_go),\r

('O',var1_o_as_in_go),\r

1793

('O(',var2_o_as_in_go),\r

1794

('50U',var1_u_as_in_but),\r

1795

('</',oy_as_in_toy),\r

1796

('O/',var1_oy_as_in_toy),\r

1797

('P',p),\r

1798

('#',r),\r

1799

(var1_r,'R',False),\r

('S',s),\r

(':',sh),\r

('T',t),\r

('6R',var1_t),\r

('.?',th_as_in_think),\r

1805

('(5',oor_as_in_poor),\r

1806

('(#',var1_oor_as_in_poor),\r

1807

('(',opt_u_as_in_pull),\r

1808

('0U3',oo_as_in_food),\r

1809

('U3',var1_oo_as_in_food),\r

1810

('U',var2_oo_as_in_food),\r

1811

('<3',close_to_or),\r

1812

(var1_close_to_or,'<',False),\r

1813

('O3',var2_close_to_or),\r

('V',v),\r

('W',w),\r

('6W',var1_w),\r

('J',y),\r

('Z',z),\r

('!',ge_of_blige_etc),\r

1820

('2',glottal_stop),\r

1821

lex_filename=ifset("BRAILLE_UNICODE","words-ipa.txt","words-ipa.brl"), # write-only for now\r

1822

lex_type = "document",\r

1823

# inline_format=",7%s7'", # -> do this in cleanup_func so it's included in BRAILLE_UNICODE if necessary\r

1824

lex_entry_format="%s = %s\n", # ditto with the markers\r

1825

word_separator=" ",phoneme_separator="",\r

1826

stress_comes_before_vowel=True,\r

1827

safe_to_drop_characters=True, # TODO: really?\r

1828

cleanup_func=lambda r:ifset("BRAILLE_UNICODE",ascii_braille_to_unicode,lambda x:x)(",7"+r+"7'"),\r

1829

cvtOut_func=unicode_to_ascii_braille,\r

1830

),\r

1831

\r

1832

"latex-ipa" : makeDic(\r

1833

'IPA symbols for typesetting in LaTeX using the "tipa" package',\r

1834

('.',syllable_separator,False),\r

1835

('"',primary_stress),\r

1836

('\\textsecstress{}',secondary_stress),\r

1837

('\\#',text_sharp),\r

1838

('\\_',text_underline),\r

1839

('?',text_question),\r

1840

('!',text_exclamation),\r

(',',text_comma),\r

('A',a_as_in_ah),\r

(':',ipa_colon),\r

('A:',var3_a_as_in_ah),\r

1845

('A\\textturnr{}',var4_a_as_in_ah),\r

1846

('a:',var5_a_as_in_ah),\r

1847

('\\ae{}',a_as_in_apple),\r

1848

('2',u_as_in_but),\r

1849

('6',o_as_in_orange),\r

1850

(var1_o_as_in_orange,'A',False),\r

1851

('O',var2_o_as_in_orange),\r

1852

('aU',o_as_in_now),\r

1853

('\\ae{}O',var1_o_as_in_now),\r

1854

('@',a_as_in_ago),\r

1855

('@:',e_as_in_herd),\r

1856

('\\textrhookschwa{}',var1_a_as_in_ago),\r

('aI',eye),\r

('Ae',var1_eye),\r

('b',b),\r

('tS',ch),\r

('d',d),\r

('D',th_as_in_them),\r

1863

('E',e_as_in_them),\r

1864

('e',var1_e_as_in_them),\r

1865

('3:',ar_as_in_year),\r

1866

('E@',a_as_in_air),\r

1867

('E\\textturnr{}',var1_a_as_in_air),\r

1868

('e:',var2_a_as_in_air),\r

1869

('E:',var3_a_as_in_air),\r

1870

('e@',var4_a_as_in_air),\r

1871

('eI',a_as_in_ate),\r

1872

('\\ae{}I',var1_a_as_in_ate),\r

('f',f),\r

('g',g),\r

('h',h),\r

('I',i_as_in_it),\r

('1',var1_i_as_in_it),\r

1878

('I@',ear),\r

1879

('I\\textturnr{}',var1_ear),\r

1880

('I@\\textturnr{}',var2_ear), # ?\r

1881

('i',e_as_in_eat),\r

1882

('i:',var1_e_as_in_eat),\r

1883

('dZ',j_as_in_jump),\r

1884

('k',k),\r

1885

('x',opt_scottish_loch),\r

1886

('l',l),\r

1887

('d\\textltilde{}',var1_l),\r

('m',m),\r

('n',n),\r

('N',ng),\r

('@U',o_as_in_go),\r

('o',var1_o_as_in_go),\r

1893

('oU',var2_o_as_in_go),\r

1894

('@0',var1_u_as_in_but),\r

1895

('OI',oy_as_in_toy),\r

1896

('oI',var1_oy_as_in_toy),\r

1897

('p',p),\r

1898

('\\textturnr{}',r),\r

1899

(var1_r,'r',False),\r

('s',s),\r

('S',sh),\r

('t',t),\r

('R',var1_t),\r

('T',th_as_in_think),\r

1905

('U@',oor_as_in_poor),\r

1906

('U\\textturnr{}',var1_oor_as_in_poor),\r

1907

('U',opt_u_as_in_pull),\r

1908

('0:',oo_as_in_food),\r

1909

('u:',var1_oo_as_in_food),\r

1910

('u',var2_oo_as_in_food),\r

1911

('O:',close_to_or),\r

1912

(var1_close_to_or,'O',False),\r

1913

('o:',var2_close_to_or),\r

1914

('v',v),\r

1915

('w',w),\r

1916

('\\textturnw{}',var1_w),\r

1917

('j',y),\r

1918

('z',z),\r

1919

('Z',ge_of_blige_etc),\r

1920

('P',glottal_stop),\r

1921

lex_filename="words-ipa.tex", # write-only for now\r

1922

lex_type = "document",\r

1923

lex_header = r'\documentclass[12pt,a4paper]{article} \usepackage[safe]{tipa} \usepackage{longtable} \begin{document} \begin{longtable}{ll}',\r

1924

lex_entry_format=r"%s & \textipa{%s}\\"+"\n",\r

1925

lex_footer = r"\end{longtable}\end{document}"+"\n",\r

1926

inline_format = "\\textipa{%s}",\r

1927

inline_oneoff_header = r"% In preamble, put \usepackage[safe]{tipa}"+"\n", # (the [safe] part is recommended if you're mixing with other TeX)\r

1928

word_separator=" ",phoneme_separator="",\r

1929

clause_separator=r"\\"+"\n",\r

1930

stress_comes_before_vowel=True,\r

1931

safe_to_drop_characters=True, # TODO: really?\r

1932

),\r

1933

\r

1934

"pinyin-approx" : makeDic(\r

1935

"Rough approximation using roughly the spelling rules of Chinese Pinyin (for getting Chinese-only voices to speak some English words; works with some words better than others)", # write-only for now\r

1936

('4',primary_stress),\r

1937

('2',secondary_stress),\r

1938

('a5',a_as_in_ah),\r

1939

('ya5',a_as_in_apple),\r

1940

('e5',u_as_in_but),\r

1941

('yo5',o_as_in_orange),\r

1942

('ao5',o_as_in_now),\r

1943

(e_as_in_herd,'e5',False),\r

('ai5',eye),\r

('bu0',b),\r

('che0',ch),\r

('de0',d),\r

('ze0',th_as_in_them),\r

1949

('ye5',e_as_in_them),\r

1950

(a_as_in_air,'ye5',False),\r

1951

('ei5',a_as_in_ate),\r

('fu0',f),\r

('ge0',g),\r

('he0',h),\r

('yi5',i_as_in_it),\r

1956

('yi3re5',ear),\r

1957

(e_as_in_eat,'yi5',False),\r

1958

('zhe0',j_as_in_jump),\r

('ke0',k),\r

('le0',l),\r

('me0',m),\r

('ne0',n),\r

('eng0',ng),\r

('ou5',o_as_in_go),\r

1965

('ruo2yi5',oy_as_in_toy),\r

('pu0',p),\r

('re0',r),\r

('se0',s),\r

('she0',sh),\r

('te0',t),\r

(th_as_in_think,'zhe0',False),\r

1972

(oor_as_in_poor,'wu5',False),\r

1973

('yu5',oo_as_in_food),\r

1974

('huo5',close_to_or),\r

(v,'fu0',False),\r

('wu0',w),\r

('yu0',y),\r

(z,'ze0',False),\r

(ge_of_blige_etc,'zhe0',False),\r

1980

approximate_missing=True,\r

1981

lex_filename="words-pinyin-approx.txt", # write-only for now\r

1982

lex_type = "text",\r

1983

lex_header = "Pinyin approxmations (very approximate!)\n----------------------------------------\n",\r

1984

lex_entry_format = "%s ~= %s\n",\r

1985

word_separator=" ",phoneme_separator="",\r

1986

cleanup_regexps=[\r

1987

("te0ye","tie"),\r

1988

("e0e5","e5"),("([^aeiou][uo])0e(5)",r"\1\2"),\r

1989

("yu0y","y"),\r

1990

("wu0yo5","wo5"),\r

1991

("([bdfghklmnpwz])[euo]0ei",r"\1ei"),\r

1992

("([bdghklmnpstwz])[euo]0ai",r"\1ai"),\r

1993

("([ghklmnpstyz])[euo]0ya",r"\1a"),("([ghklmnpstz])a([0-5]*)ne0",r"\1an\2"),\r

1994

("([bdfghklmnpstwyz])[euo]0a([1-5])",r"\1a\2"),\r

1995

("([bdjlmnpt])[euo]0yi",r"\1i"),("([bjlmnp])i([1-5]*)ne0",r"\1in\2"),\r

1996

("([zs])he0ei",r"\1hei"),\r

1997

("([dfghklmnprstyz])[euo]0ou",r"\1ou"),\r

1998

("([dghklnrst])[euo]0huo",r"\1uo"),\r

1999

("([bfpm])[euo]0huo",r"\1o"),\r

2000

("([bdghklmnprstyz])[euo]0ao",r"\1ao"),\r

2001

("([zcs])h[eu]0ao",r"\1hao"),\r

2002

("re0r","r"),\r

2003

("zhe0ne0","zhun5"),\r

2004

("54","4"),\r

2005

("52","2"),\r

2006

("([bdjlmnpty])i([1-9])eng0",r"\1ing\2"),\r

2007

("ya([1-9])eng0",r"yang\1"),\r

2008

("ya([1-9])ne0",r"an\1"),\r

2009

("ye([1-9])ne0",r"yan\1"),("([wr])[eu]0yan",r"\1en"),\r

2010

("yi([1-9])ne0",r"yin\1"),\r

2011

\r

2012

("yu0","yu5"),("eng0","eng5"), # they won't work unvoiced anyway\r

2013

("0","5"), # comment out if the synth supports 'tone 0 for unvoiced'\r

2014

#("[euo]0","0"), # comment in if it expects consonants only when doing that\r

],\r

),\r

\r

"kana-approx" : makeDic(\r

2019

"Rough approximation using kana (for getting Japanese computer voices to speak some English words; works with some words better than others). Set KANA_TYPE environment variable to hiragana or katakana (which can affect the sounds of some voices); default is hiragana", # for example on Mac OS 10.7+ (with Japanese voice installed in System Preferences) try PHONES_PIPE_COMMAND='say -v Kyoko' (this voice has a built-in converter from English as well, but lexconvert --phones kana-approx can work better with some complex words, although the built-in converter does seem to have access to slightly more phonemes and can therefore produce words like "to" better). Default is hiragana because I find hiragana easier to read than katakana, although the Kyoko voice does seem to be able to say 'v' a little better when using kata. Mac OS 10.7+'s Korean voices (Yuna and Narae) can also read kana, and you could try doing a makeVariantDic and adding in some Korean jamo letters for them (you'd be pushed to represent everything in jamo but kana+jamo seems more hopeful in theory), but again some words work better than others (not all phonetic combinations are supported and some words aren't clear at all).\r

2020

# This kana-approx format is 'write-only' for now (see comment in cleanup_regexps re possible reversal)\r

2021

(u'\u30fc',primary_stress),\r

2022

(secondary_stress,ifset('KANA_MORE_EMPH',u'\u30fc'),False), # set KANA_MORE_EMPH environment variable if you want to try doubling the secondary-stressed vowels as well (doesn't always work very well; if it did, I'd put this line in a makeVariantDic called kana-approx-moreEmph or something)\r

2023

# The following Unicode codepoints are hiragana; KANA_TYPE is handled by cleanup_func below\r

2024

(u'\u3042',a_as_in_apple),\r

2025

(u'\u3044',e_as_in_eat),\r

2026

(u'\u3046',oo_as_in_food),\r

2027

(u'\u3048',e_as_in_them),\r

2028

(u'\u304a',o_as_in_orange),\r

2029

(u'\u3042\u3044',eye), # ai\r

2030

(u'\u3042\u304a',o_as_in_now), # ao\r

2031

(u'\u3048\u3044',a_as_in_ate), # ei\r

2032

(u'\u304a\u3044',oy_as_in_toy), # oi\r

2033

(u'\u304a\u3046',o_as_in_go), # ou\r

2034

(a_as_in_ah,u'\u3042',False),\r

2035

(a_as_in_ago,u'\u3046\u304a',False), # TODO: \u3042, \u304a or \u3046 depending on the word?\r

2036

(e_as_in_herd,u'\u3042',False), # TODO: really?\r

2037

(i_as_in_it,u'\u3044',False), # TODO: really?\r

2038

(u_as_in_but,u'\u3046',False), # TODO: really?\r

2039

(ar_as_in_year,u'\u3048',False), # TODO: really?\r

2040

(ear,u'\u3044\u304a',False), # TODO: really?\r

2041

(a_as_in_air,u'\u3048',False), # TODO: really?\r

2042

(oor_as_in_poor,u'\u304a',False), # TODO: really?\r

2043

(close_to_or,u'\u304a\u30fc'), # TODO: really?\r

2044

(u'\u3076',b), # bu (with vowel replacements later)\r

2045

(u'\u3061\u3047',ch), # chu (ditto)\r

2046

(u'\u3065',d), # du (and so on)\r

2047

(u'\u3066\u3085',th_as_in_think), (th_as_in_them,u'\u3066\u3085',False),\r

2048

(u'\u3075',f),\r

2049

(u'\u3050',g),\r

2050

(u'\u306f',h), # ha (as hu == fu)\r

2051

(u'\u3058\u3085',j_as_in_jump), (ge_of_blige_etc,u'\u3058\u3085',False),\r

2052

(u'\u304f',k),\r

2053

(u'\u308b',l), (r,u'\u308b',False),\r

2054

(u'\u3080',m),\r

2055

(u'\u306c',n),\r

2056

(u'\u3093\u3050',ng),\r

2057

(u'\u3077',p),\r

2058

(u'\u3059',s),\r

2059

(u'\u3057\u3085',sh),\r

2060

(u'\u3064',t),\r

2061

(u'\u308f',w), # use 'wa' (as 'wu' == 'u')\r

2062

(v,ifset('KANA_V_AS_W',u'\u308f',u'\u3094'),False), # TODO: document KANA_V_AS_W variable. Is vu always supported? (it doesn't seem to show up in all fonts)\r

2063

(u'\u3086',y),\r

2064

(u'\u305a',z),\r

2065

lex_filename="words-kana-approx.txt",\r

2066

lex_type = "text",\r

2067

lex_header = "Kana approxmations (very approximate!)\n--------------------------------------\n",\r

2068

lex_entry_format = "%s ~= %s\n",\r

2069

word_separator=" ",phoneme_separator="",\r

2070

clause_separator=u"\u3002\n".encode('utf-8'),\r

2071

cleanup_regexps=[(u"\u306c$",u"\u3093\u30fc"), # TODO: or u"\u3093\u3093" ?\r

2072

# now the vowel replacements (bu+a -> ba, etc) (in most cases these can be reversed into cvtOut_regexps if you want to use the kana-approx table to convert hiragana into approximate English phonemes (plus add a (u"\u3093\u30fc*",u"\u306c") and perhaps de-doubling rules to convert back to emphasis) but the result is unlikely to be any good)\r

2073

(u"\u3076\u3042",u"\u3070"),(u"\u3076\u3044",u"\u3073"),(u"\u3076\u3048",u"\u3079"),(u"\u3076\u304a",u"\u307c"),(u"\u3076\u3046",u"\u3076"),\r

2074

(u"\u3061\u3085\u3042",u"\u3061\u3083"),(u"\u3061\u3085\u3046",u"\u3061\u3085"),(u"\u3061\u3085\u3048",u"\u3061\u3047"),(u"\u3061\u3085\u304a",u"\u3061\u3087"),(u"\u3061\u3085\u3044",u"\u3061"),\r

2075

(u"\u3065\u3042",u"\u3060"),(u"\u3065\u3044",u"\u3062"),(u"\u3065\u3048",u"\u3067"),(u"\u3065\u304a",u"\u3069"),(u"\u3065\u3046",u"\u3065"),\r

2076

(u"\u3066\u3085\u3042",u"\u3066\u3083"),(u"\u3066\u3085\u3044",u"\u3066\u3043"),(u"\u3066\u3043\u3046",u"\u3066\u3085"),(u"\u3066\u3085\u3048",u"\u3066\u3047"),(u"\u3066\u3085\u304a",u"\u3066\u3087"),\r

2077

(u"\u3075\u3042",u"\u3075\u3041"),(u"\u3075\u3044",u"\u3075\u3043"),(u"\u3075\u3048",u"\u3075\u3047"),(u"\u3075\u304a",u"\u3075\u3049"),(u"\u3075\u3046",u"\u3075"),\r

2078

(u"\u306f\u3044",u"\u3072"),(u"\u306f\u3046",u"\u3075"),(u"\u306f\u3048",u"\u3078"),(u"\u306f\u304a",u"\u307b"),(u"\u306f\u3042",u"\u306f"),\r

2079

(u"\u3050\u3042",u"\u304c"),(u"\u3050\u3044",u"\u304e"),(u"\u3050\u3048",u"\u3052"),(u"\u3050\u304a",u"\u3054"),(u"\u3050\u3046",u"\u3050"),\r

2080

(u"\u3058\u3085\u3042",u"\u3058\u3083"),(u"\u3058\u3085\u3046",u"\u3058\u3085"),(u"\u3058\u3085\u3048",u"\u3058\u3047"),(u"\u3058\u3085\u304a",u"\u3058\u3087"),(u"\u3058\u3085\u304a",u"\u3058"),\r

2081

(u"\u304f\u3042",u"\u304b"),(u"\u304f\u3044",u"\u304d"),(u"\u304f\u3048",u"\u3051"),(u"\u304f\u304a",u"\u3053"),(u"\u304f\u3046",u"\u304f"),\r

2082

(u"\u308b\u3042",u"\u3089"),(u"\u308b\u3044",u"\u308a"),(u"\u308b\u3048",u"\u308c"),(u"\u308b\u304a",u"\u308d"),(u"\u308b\u3046",u"\u308b"),\r

2083

(u"\u3080\u3042",u"\u307e"),(u"\u3080\u3044",u"\u307f"),(u"\u3080\u3048",u"\u3081"),(u"\u3080\u304a",u"\u3082"),(u"\u3080\u3046",u"\u3080"),\r

2084

(u"\u306c\u3042",u"\u306a"),(u"\u306c\u3044",u"\u306b"),(u"\u306c\u3048",u"\u306d"),(u"\u306c\u304a",u"\u306e"),(u"\u306c\u3046",u"\u306c"),\r

2085

(u"\u3077\u3042",u"\u3071"),(u"\u3077\u3044",u"\u3074"),(u"\u3077\u3048",u"\u307a"),(u"\u3077\u304a",u"\u307d"),(u"\u3077\u3046",u"\u3077"),\r

2086

(u"\u3059\u3042",u"\u3055"),(u"\u3059\u3048",u"\u305b"),(u"\u3059\u304a",u"\u305d"),(u"\u3059\u3046",u"\u3059"),\r

2087

(u"\u3057\u3085\u3042",u"\u3057\u3083"),(u"\u3057\u3085\u3046",u"\u3057\u3085"),(u"\u3057\u3085\u3048",u"\u3057\u3047"),(u"\u3057\u3085\u304a",u"\u3057\u3087"),(u"\u3057\u3085\u3044",u"\u3057"),\r

2088

(u"\u3064\u3042",u"\u305f"),(u"\u3064\u3044",u"\u3061"),(u"\u3064\u3048",u"\u3066"),(u"\u3064\u304a",u"\u3068"),(u"\u3064\u3046",u"\u3064"),\r

2089

(u"\u3086\u3042",u"\u3084"),(u"\u3086\u3048",u"\u3044\u3047"),(u"\u3086\u304a",u"\u3088"),(u"\u3086\u3046",u"\u3086"),\r

2090

(u"\u305a\u3042",u"\u3056"),(u"\u305a\u3044",u"\u3058"),(u"\u305a\u3048",u"\u305c"),(u"\u305a\u304a",u"\u305e"),(u"\u305a\u3046",u"\u305a"),\r

2091

(u"\u308f\u3044",u"\u3046\u3043"),(u"\u308f\u3046",u"\u3046"),(u"\u308f\u3048",u"\u3046\u3047"),(u"\u308f\u304a",u"\u3092"),(u"\u308f\u3042",u"\u308f"),\r

2092

(u'\u3046\u3043\u3066\u3085', u'\u3046\u3043\u3065'), # sounds a bit better for words like 'with'\r

2093

(u'\u3085\u3046',u'\u3085'), # and 'the' (especially with a_as_in_ago mapping to u'\u3046\u304a'; it's hard to get a convincing 'the' though, especially in isolation)\r

2094

(u'\u3050\u3050',u'\u3050'), # gugu -> gu, sometimes comes up with 'gl-' combinations\r

2095

(u'\u30fc\u30fc+',u'\u30fc'), # in case we put 30fc in the table AND a stress mark has been applied to it\r

2096

(u'^(.)$',u'\\1\u30fc'), # lengthen any word that ends up as a single kana (otherwise can be clipped badly)\r

2097

(u'^([\u3042\u3070\u3060\u304c\u304b\u3089\u307e\u306a\u3071\u3055\u305f\u3084\u3056\u308f]\u3044)$',u'\\1\u30fc'), # ditto for -ai (TODO: -ao might need lengthening sometimes?? depends on context. -ei, -oi, -ou seem OK)\r

2098

],\r

2099

cleanup_func = hiragana_to_katakana\r

2100

),\r

2101

\r

2102

"deva-approx" : makeDic(\r

2103

"Rough approximation using Devanagari (for getting Indian computer voices to speak some English words; works with some words better than others); can also be used to approximate Devanagari words in English phonemes",\r

2104

(u'\u02c8',primary_stress),\r

2105

(u'\u093e',a_as_in_ah),(u'\u0906',a_as_in_ah,False),\r

2106

(u'\u0905',u_as_in_but),\r

2107

(u'\u092c',b),\r

2108

(u'\u091b',ch),(u'\u091a',ch,False),\r

2109

(u'\u0926',d),(u'\u0921',d,False), # TODO: check which sounds better for reading English words\r

2110

(u'\u0920',th_as_in_them), # (very approximate)\r

2111

(u'\u0948',e_as_in_them),(u'\u0910',e_as_in_them,False),\r

2112

(u'\u0947',a_as_in_ate),(u'\u090f',a_as_in_ate,False),\r

2113

(u'\u092b\u093c',f),\r

2114

(u'\u0917',g),\r

2115

(u'\u0917\u093c',g,False), # (Hindi; differs in others)\r

2116

(u'\u0939',h),(u'\u0903',h,False),\r

2117

(u'\u093f',i_as_in_it),(u'\u0907',i_as_in_it,False),\r

2118

(u'\u0940',e_as_in_eat),(u'\u0908',e_as_in_eat,False),\r

2119

(u'\u091c',j_as_in_jump),\r

2120

(u'\u0915',k),(u'\u0916',k,False),\r

2121

(u'\u0916\u093c',opt_scottish_loch),\r

2122

(u'\u0915\u093c',opt_scottish_loch,False), # ?\r

2123

(u'\u0932',l),\r

2124

(u'\u092e',m),\r

2125

(u'\u0928',n),(u'\u0923',n,False),\r

2126

(u'\u0902',ng),\r

2127

(u'\u092a',p),\r

2128

(u'\u092b',f,False), # (Hindi; p in some others?)\r

2129

(u'\u0930',r),(u'\u0921\u093c',r,False),\r

2130

(u'\u0938',s),\r

2131

(u'\u0936',sh), (u'\u0937',sh,False),\r

2132

(u'\u091f',t),(u'\u0924',t,False),(u'\u0925',t,False),\r

2133

(u'\u0941',opt_u_as_in_pull),(u'\u0909',opt_u_as_in_pull,False),\r

2134

(u'\u0942',oo_as_in_food),(u'\u090a',oo_as_in_food,False),\r

2135

(u'\u094c',close_to_or),(u'\u0914',close_to_or,False),\r

2136

(u'\u094b',opt_ol_as_in_gold),(u'\u0913',opt_ol_as_in_gold,False),\r

2137

(u'\u0935',v),(w,u'\u0935',False),\r

2138

(u'\u092f',y),\r

2139

(u'\u091c\u093c',z),\r

2140

(u'\u091d\u093c',ge_of_blige_etc),\r

2141

(u'\u0901',ipa_colon),\r

2142

word_separator=" ",phoneme_separator="",\r

2143

stress_comes_before_vowel=True,\r

2144

safe_to_drop_characters=True, # it's an approximation\r

2145

approximate_missing=True,\r

2146

cleanup_regexps=[\r

2147

# add virama if consonant not followed by vowel, and delete default vowel after consonant:\r

2148

(u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u094d'),(u'(?<=[\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0905',u''),(u'(.)\u094d\u02c8',u'\u02c8\\1'),\r

2149

# replace vowel signs with vowel letters if not preceded by consonants:\r

2150

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093e',u'\u0906'),\r

2151

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093f',u'\u0907'),\r

2152

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0940',u'\u0908'),\r

2153

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0941',u'\u0909'),\r

2154

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0942',u'\u090a'),\r

2155

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0947',u'\u090f'),\r

2156

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0948',u'\u0910'),\r

2157

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094b',u'\u0913'),\r

2158

(u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094c',u'\u0914')],\r

2159

cvtOut_func=unicode_preprocess,\r

2160

cvtOut_regexps=[\r

2161

# add default vowel when necessary:\r

2162

(u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u094d\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u0905'),(u'\u094d',u''),\r

2163

# 'add h' approximations:\r

2164

(u'\u092d',u'\u092c\u0939'),(u'\u0927',u'\u0922\u0939'),(u'\u0918',u'\u0917\u0939'),(u'\u091d',u'\u091c\u0939'),(u'\u0922\u093c',u'\u0921\u093c\u0939'),\r

]),\r

\r

"names" : makeDic(\r

"Lexconvert internal phoneme names (sometimes useful with the --phones option while developing new formats)",\r

2169

*[(phName,phVal) for phName,phVal in phonemes.items()])}\r

2170

\r

2171

# The mainopt_...() functions are the main options\r

2172

# (if you implement a new one, main() will detect it);\r

2173

# 1st line of doc string should be parameter summary\r

2174

# (start the doc string with \n if no parameters); if 1st\r

2175

# character of doc string is * then this function is put\r

2176

# among the first in the help (otherwise alphabetically).\r

2177

# If function returns a string, that's taken to be a\r

2178

# message to be printed with error exit. Same if it raises\r

2179

# an exception of type Message.\r

2180

\r

2181

def mainopt_try(i):
    """*<format> [<pronunciation>]
Convert input from <format> into eSpeak and try it out.
(Requires the 'espeak' command.)
E.g.: python lexconvert.py --try festival h @0 l ou1
or: python lexconvert.py --try unicode-ipa '\\u02c8\\u0279\\u026adn\\u0329' (for Unicode put '\\uNNNN' or UTF-8)"""
    # (The doc string above is kept verbatim: per the note above the
    # mainopt_ functions, its 1st line is the parameter summary.)
    # i is the index in sys.argv of this option; the format name follows it.
    srcFormat = sys.argv[i+1]
    if srcFormat not in lexFormats:
        return "No such format "+repr(srcFormat)+" (use --formats to see a list of formats)"
    prompt = "phonemes in "+srcFormat+" format"
    for phones in getInputText(i+2,prompt,'maybe'):
        espeakPhones = convert(phones,srcFormat,'espeak')
        # separate process each item for more responsiveness from the console (sending 'maybe' to getInputText means won't lose efficiency if not read from console)
        espeakPipe = os.popen("espeak -x","w")
        pipeBuf = getBuf(espeakPipe)
        pipeBuf.write(markup_inline_word("espeak",espeakPhones)+as_utf8('\n'))

2193

\r

2194

def mainopt_trymac(i):
    """*<format> [<pronunciation>]
Convert phonemes from <format> into Mac and try it using the Mac OS 'say' command"""
    # (Doc string kept verbatim: its 1st line is the parameter summary.)
    # i is the index in sys.argv of this option; the format name follows it.
    srcFormat = sys.argv[i+1]
    if srcFormat not in lexFormats:
        return "No such format "+repr(srcFormat)+" (use --formats to see a list of formats)"
    prompt = "phonemes in "+srcFormat+" format"
    for resp in getInputText(i+2,prompt,'maybe'):
        macPhones = convert(resp,srcFormat,'mac')
        toSay = markup_inline_word("mac",macPhones)
        # show what is about to be sent to the speech command
        print(as_printable(toSay))
        # Need to specify a voice because the default voice might not be able to take Apple phonemes. Vicki has been available since 10.3, as has the 'say' command (previous versions need osascript, see Gradint's code)
        sayPipe = os.popen(macSayCommand()+" -v Vicki","w")
        getBuf(sayPipe).write(toSay)

2205

\r

2206

def mainopt_trymac_uk(i):
    """*<format> [<pronunciation>]
Convert phonemes from <format> and try it with Mac OS British voices (see --mac-uk for details)"""
    # (Doc string kept verbatim: its 1st line is the parameter summary.)
    assert sys.version_info[0]==2, "--trymac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
    srcFormat = sys.argv[i+1]
    if srcFormat not in lexFormats:
        return "No such format "+repr(srcFormat)+" (use --formats to see a list of formats)"
    prompt = "phonemes in "+srcFormat+" format"
    for resp in getInputText(i+2,prompt,'maybe'):
        macukPhones = convert(resp,srcFormat,'mac-uk')
        # voice can be overridden with the MACUK_VOICE environment variable
        voice = os.environ.get("MACUK_VOICE","Daniel")
        lexicon = MacBritish_System_Lexicon("",voice)
        try:
            # inner try/finally: always close the lexicon object,
            # and do so BEFORE the interrupt is reported below
            try:
                lexicon.speakPhones(macukPhones.split())
            finally:
                lexicon.close()
        except KeyboardInterrupt:
            sys.stderr.write("Interrupted\n")

2220

\r

2221

def mainopt_phones(i):
    """*<format> [<words>]
Use eSpeak to convert text to phonemes, and then convert the phonemes to format 'format'.
E.g.: python lexconvert.py --phones unicode-ipa This is a test sentence.
Set environment variable PHONES_PIPE_COMMAND to an additional command to which to write the phones as well as standard output. (If standard input is a terminal then this will be done separately after each line.)
(Some commercial speech synthesizers do not work well when driven entirely from phonemes, because their internal format is different and is optimised for normal text.)
Set format to 'all' if you want to see the phonemes in ALL supported formats."""
    # (Doc string kept verbatim: its 1st line is the parameter summary.)
    # Returns an error-message string on failure, otherwise None.
    format = sys.argv[i+1]
    if format=="example": return "The 'example' format cannot be used with --phones; try --convert, or did you mean --phones festival" # could allow example anyway as it's basically Festival, but save confusion as eSpeak might not generate the same phonemes if our example words haven't been installed in the system's eSpeak. (Still allow it to be used in --try etc though.)
    if not format in lexFormats and not format=="all": return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    # Work out the list of output formats ONCE, before reading any input.
    # (Previously the inner loop re-used the name 'format' as its loop
    # variable, so after the first line of input with 'all' every later
    # line was output only in the last format of the list.)
    if format=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
    else: formats = [format]
    pipeCommand = os.environ.get("PHONES_PIPE_COMMAND","")
    # pronunciation marks to remove before passing the text to eSpeak
    marksToDrop = [u'\u2032',u'\u00b4',u'\u02b9',u'\u00b7']
    def out(fmt,response,doOneoff=True):
        # writes one response in one format to the CURRENT sys.stdout
        if len(formats)>1: writeFormatHeader(fmt)
        if doOneoff: getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_oneoff_header")))
        getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_header")))
        output_clauses(fmt,convert(parseIntoWordsAndClauses("espeak",response),"espeak",fmt))
        getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_footer")))
        print("")
        sys.stdout.flush() # in case it's being piped
    hadOneoff = False
    for response in getInputText(i+2,"text",'maybe'):
        cleaned = as_utf8(response)
        for mark in marksToDrop:
            cleaned = cleaned.replace(mark.encode('utf-8'),as_utf8(''))
        response = pipeThroughEspeak(cleaned)
        if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
        for fmt in formats:
            out(fmt,response,not hadOneoff) ; hadOneoff = True
            if pipeCommand:
                # temporarily point sys.stdout at the extra command
                realStdout,sys.stdout = sys.stdout,os.popen(pipeCommand,'w')
                try: out(fmt,response)
                finally:
                    # always restore stdout, and close the pipe explicitly rather than relying on garbage collection
                    pipe,sys.stdout = sys.stdout,realStdout
                    pipe.close()

\r

def mainopt_ruby(i):
    """*<format> [<words>]
Like --phones but outputs the result as HTML RUBY markup, with each word's pronunciation symbols placed above the corresponding English word.
E.g.: python lexconvert.py --ruby unicode-ipa This is a test sentence.
This option is made more complicated by the fact that different versions of eSpeak may space the phoneme output differently, for example when handling numbers; if your eSpeak version is not recognised then all numbers are unannotated. Anyway you are advised not to rely on this option working with the new development NG versions of eSpeak. If the version you have behaves unexpectedly, words and phonemes output might lose synchronisation. However this option is believed to be stable when used with simple text and the original eSpeak.
You can optionally set the RUBY_GRADINT_CGI environment variable to the URL of an instance of Gradint Web Edition to generate audio links for each word. If doing this in a Web Adjuster filter, see comments in the lexconvert source for setup details."""
    # (Doc string kept verbatim: its 1st line is the parameter summary.)
    # Returns an error-message string on failure, otherwise None.
    # htmlFilter with --htmlText of course. Set separator to two newlines and copy the generated 'h5a' function (from a manual run or the lexconvert source) into Adjuster's headAppend option (but don't expect HTML5 audio to work from Adjuster's submitBookmarklet option; pronunciation links will take you off the page if it doesn't).
    # Use double newlines as single newlines are used in the h5a script; adding that script via bookmarklet doesn't always run it
    format = sys.argv[i+1]
    if format=="example": return "The 'example' format cannot be used with --ruby; did you mean festival?" # as above
    elif format=="all": return "The --phones all option cannot be used with --ruby" # (well you could implement it if you want but the resulting ruby would be quite unwieldy)
    if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    # Normalise apostrophe-like characters, drop U+00B7, and turn no-break spaces into ordinary spaces
    text = as_utf8(getInputText(i+2,"text"))
    for old,new in [(u'\u2019',"'"),(u'\u2032',"'"),(u'\u00b4',"'"),(u'\u02b9',"'"),(u'\u00b7',''),(u'\u00a0',' ')]:
        text = text.replace(old.encode('utf-8'),as_utf8(new))
    # eSpeak's basic idea of an alphabetical word (most versions?) -
    wordRegexps = [r"(?:[A-Z]+['?-])*(?:(?:(?<![A-z.])(?:[A-z]\.)+[A-z](?![A-z.]))|[A-Z]+[a-z](?![A-z])|[A-Z][A-Z]+(?![a-z][A-z])|[A-Z]?(?:[a-z]['?-]?)+|[A-Z])"]
    # A dot, when not part of an ellipsis, followed by a letter is pronounced "dot", and two of them are pronounced "dot dot":
    wordRegexps.append(r"(?<!\.\.)\.(?=[A-z])|(?<!\.)\.(?=\.[A-z])")
    # ! followed by a letter is pronounced "exclamation", and .! is "dotexclamation"; @ symbols similarly; copyright
    atEtc = u"(?:[@!:]|\u00a9)*".encode('utf-8')
    wordRegexps.append(as_utf8(r"\.?[!@]+(?=[A-z])|(?<![A-z])@")+atEtc+as_utf8("(?![A-z])|")+u'\u00a9'.encode('utf-8')+atEtc)
    # : between numbers if NOT followed by 2 digits:
    wordRegexps.append(r"(?<![A-z]):(?![A-z]|[0-9][0-9])")
    # - between numbers
    wordRegexps.append(r"(?<=[0-9])-(?=[0-9])")
    # TODO: if you paste in (e.g.) CJK characters, eSpeak will say "symbol-symbol-symbol" etc, but this is not accounted for by the above regexp so it'll go onto following words.
    vLine = espeak_version_line()
    if "1.45." in vLine:
        # This seems to work in eSpeak 1.45:
        # (TODO: test leading 0s & leading decimal)
        # a number of 4 digits or less (with any number of digits after the decimal point) is grouped as 1 word:
        wordRegexps.append(r"(?<![0-9])[0-9]{1,4}(?:\.[0-9]+)?(?!,?[0-9])")
        # and a number of 1 to 3 digits with any number of 000 or ,000 groups, with optional decimal point followed by any number of digits, OR when placed before an integer number of 3-digit groups, is grouped as 1 word:
        wordRegexps.append(r"[0-9]{1,3}(?:,?000)*(?:\.[0-9]+)?,?(?=(?:,?[0-9]{3,3})*,?(?:[^0-9]|$))")
        text2 = text
    elif "1.48." in vLine:
        # In eSpeak 1.48 the groups are smaller.
        # Decimal point and everything after it = individual
        wordRegexps.append(r"(?<=[0-9])\.(?=[0-9])")
        for places in range(25): # TODO: really want unbounded, but (?<=...) is fixed-length
            wordRegexps.append(r"(?<=[0-9]\."+"[0-9]"*places+r")[0-9]")
        # Number with a leading dot grouped as 1 word:
        wordRegexps.append(r"(?<![0-9])\.[0-9]+")
        # TODO: leading 0s (0000048 goes to 0 000 048)
        # For normal numbers:
        # A null string w. 3 or 6 digits to go and digits b4 shld match for 'thousand', 'million' (unless 3+ digits are leading 0s, or fewer than 3 leading 0s and whole thing begins with a 0, or it's part of a decimal expansion, in which case different rules apply, but (?<=...) must be fixed-length, so we need another one of these awful loops) :
        for prevDigits in range(10):
            for beforeThat in ["^",r"[^.0-9,]"]: # beginning of string, or something OTHER than a decimal point / num
                wordRegexps.append(r"(?<="+beforeThat+"[1-9]"+"[0-9,]"*prevDigits+r")(?<!,)(?<!000)(?# empty string )(?=(?:,?(?:[0-9]{3,3}))+(?:[^0-9]|$))")
        # 1-9 (not 0) with 2, 5 or 8 etc digits to go = "N-hundred-and" :
        wordRegexps.append(r"[1-9](?=[0-9][0-9](?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
        # + 0 with 2 digits to go when preceded by digits = "and", as long as followed by at least one non-0:
        wordRegexps.append(r"(?<=[0-9,])0(?=(?:[0-9][1-9]|[1-9][0-9])(?:[^0-9,]|$))")
        # 1 or 2 digits with 0,3,6.. to go = "seventy-six" or whatever, as long as they're not both 0 :
        wordRegexps.append(r"(?:0[1-9]|[1-9][0-9]?)(?=(?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
        # 0 by itself (not preceded by digits) = "nought" :
        wordRegexps.append(r"(?<![0-9])0(?=[^0-9]|$)")
        # these multi-word groups are tried first (inserted at the front of the list):
        wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:of|on|in|that|with|for|was) (?:the|a)(?= )")
        wordRegexps.insert(0,"(?:Of|On|In|That|With|For|Was) (?:the|a)(?= )")
        wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])not a(?= )")
        wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:some|that) one(?= )")
        wordRegexps.insert(0,"(?:Some|That) one(?= )")
        text2 = text
    else: text2 = re.sub(maybe_bytes(r"\.?[0-9]+",text),maybe_bytes("",text),text) # unknown eSpeak version: don't annotate the numbers (pattern and replacement are matched to the type of text, as re refuses to mix str patterns with bytes input on Python 3)
    response = pipeThroughEspeak(text2)
    if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
    gradint_cgi = os.environ.get("RUBY_GRADINT_CGI","")
    if gradint_cgi:
        def linkStart(w):
            # opening <a> tag for word w, with % and & in w percent-encoded for the URL
            return maybe_bytes('<a href="',w)+maybe_bytes(gradint_cgi,w)+maybe_bytes('?js=[[',w)+w.replace(maybe_bytes('%',w),maybe_bytes('%25',w)).replace(maybe_bytes('&',w),maybe_bytes('%26',w))+maybe_bytes(']]&jsl=en" onclick="return h5a(this);">',w)
        linkEnd = '</a>'
        print(r"""<script><!-- // HTML5-audio function
function h5a(link) {
if (document.createElement) {
var ae = document.createElement('audio');
if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/mpeg'))) {
ae.setAttribute('src', link.href);
ae.play(); return false;
} else if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/ogg'))) {
ae.setAttribute('src', link.href+"&filetype=ogg");
ae.play(); return false; }
} return true; }
//--></script>""")
    else:
        def linkStart(w): return maybe_bytes("",w)
        linkEnd = ""
    rubyList = []
    for clause in parseIntoWordsAndClauses("espeak",response):
        for w in clause:
            converted = convert(w,"espeak",format)
            if not converted: continue # e.g. a lone _:_:
            m = markup_inline_word(format,converted)
            # HTML-escape the pronunciation (& must be done before <)
            m = m.replace(maybe_bytes("&",m),maybe_bytes("&amp;",m)).replace(maybe_bytes("<",m),maybe_bytes("&lt;",m))
            rubyList.append(linkStart(w)+m+maybe_bytes(linkEnd,w))
    rubyList.reverse() # so can pop() left-to-right order
    # Write out re.sub ourselves, because (1) some versions of the library (e.g. on 2.7.12) try to do some things in-place, and we're using previous-context regexps that aren't compatible with previous things having been already <ruby>'ified, and (2) if we match a 0-length string, re.finditer won't ALSO return a non-0 length match starting in the same place, and we want both (so we're using wordRegexps as a list rather than an | expression)
    matches = {}
    debug = False # if True, will add ruby title=(index of the regexp that matched)
    for regexpIndex,regexp in enumerate(wordRegexps):
        for match in re.finditer(maybe_bytes(regexp,text),text):
            matches[(match.start(),match.end())] = regexpIndex
    i = 0 ; r = []
    # Sort by start position, and for equal starts put the LONGER match first.
    # (A key function is used because sorted() does not accept a positional
    # comparison function on Python 3.)
    for start,end in sorted(matches.keys(),key=lambda span:(span[0],-span[1])):
        if start<i: continue # overlap??
        r.append(text[i:start])
        if start==end: m = maybe_bytes(" ",text)
        else: m = text[start:end].replace(maybe_bytes("&",text),maybe_bytes("&amp;",text)).replace(maybe_bytes("<",text),maybe_bytes("&lt;",text))
        try: rt = rubyList.pop()
        except IndexError: rt = as_utf8("ERROR") # we've lost synchronisation
        if debug: title = as_utf8(" title=")+as_utf8(str(matches[(start,end)]))
        else: title = as_utf8("")
        r.append(as_utf8("<ruby")+title+as_utf8("><rb>")+m+as_utf8("</rb><rt>")+rt+as_utf8("</rt></ruby>"))
        i = end
    r.append(text[i:])
    while rubyList: # oops, lost synchronisation the other way (TODO: show this per-paragraph? but don't call eSpeak too many times if processing many short paragraphs)
        r.append(as_utf8("<ruby><rb>ERROR</rb><rt>")+rubyList.pop()+as_utf8("</rt></ruby>"))
    out = as_utf8("").join(r)
    if not out.endswith(as_utf8("\n")): out += as_utf8("\n")
    getBuf(sys.stdout).write(out)

2374

\r

2375

def pipeThroughEspeak(inpt):
    "Writes inpt to espeak -q -x (in chunks if necessary) and returns the result"
    # inpt must be a byte-string; the return value is a byte-string holding
    # eSpeak's combined output for every chunk.  If eSpeak cannot be run,
    # the result is a single line containing the word 'command' (callers
    # test for that to detect 'bad cmd' / 'cmd not found').
    assert type(inpt)==bytes
    bufsize = 8192 # careful not to set this too big, as the OS might limit it (TODO can we check?)
    # Byte-string needles: searching bytes for a str raises TypeError on Python 3
    newline,space,command = as_utf8('\n'),as_utf8(' '),as_utf8('command')
    ret = []
    while len(inpt) > bufsize:
        # Peel off one chunk of at most bufsize bytes, preferring to cut
        # just after a newline, then just after a space
        splitAt = inpt.rfind(newline,0,bufsize)+1
        if not splitAt: # no newline, try to split on space
            splitAt = inpt.rfind(space,0,bufsize)+1
            if not splitAt:
                sys.stderr.write("Note: had to split eSpeak input and couldn't find a newline or space to do it on\n")
                splitAt = bufsize
        response = pipeThroughEspeak(inpt[:splitAt])
        if not newline in response.rstrip() and command in response: return response.strip() # 'bad cmd' / 'cmd not found'
        ret.append(response) ; inpt=inpt[splitAt:]
    try: w,r=os.popen4("espeak -q -x",bufsize=bufsize) # Python 2
    except AttributeError: # Python 3 (no os.popen4)
        import subprocess
        try:
            # stderr is merged into stdout so that eSpeak's diagnostics
            # reach the caller, as they do with popen4 on Python 2
            proc=subprocess.Popen(['espeak','-q','-x'],stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
        except OSError:
            # No shell is involved here, so nothing prints "command not
            # found" for us; produce the one-line form callers look for
            return as_utf8("espeak: command not found")
        # communicate() writes the input, closes stdin and collects the
        # output without risking a pipe deadlock
        out,_ = proc.communicate(inpt)
        r = out or as_utf8("")
    else: # Python 2
        getBuf(w).write(inpt) ; w.close()
        r = getBuf(r).read()
    return as_utf8("\n").join(ret) + r

2406

\r

2407

def espeak_version_line():
    "Returns the first line printed by 'espeak -h' (standard error included)"
    helpOutput = os.popen("espeak -h 2>&1").read()
    firstLine = helpOutput.strip().split("\n")[0]
    return firstLine

2408

\r

2409

def writeFormatHeader(format):
    "Prints the name of 'format' underlined with dashes, for use when outputting in all formats. A blank line is printed first on every call except the first, since the preceding format's output might have been more than one line."
    global writeFormatHeader_called
    underline = '-'*len(format)
    if writeFormatHeader_called:
        print("") # separates this header from the previous format's output
    print(format)
    print(underline)
    writeFormatHeader_called = True
# Becomes True once the first header has been written
writeFormatHeader_called = False

2417

\r

2418

def mainopt_check_variants(i):
    # undocumented (won't appear in help text)
    # (so do NOT give this function a docstring)
    # Groups the eSpeak table's string keys by the integer part of their
    # phoneme number, skipping consonants, then for every group with more
    # than one member pipes the members to "espeak -x" so they can be
    # compared by ear, repeating for as long as the user answers non-zero.
    groups = {}
    for k,v in lexFormats['espeak'].items():
        if type(k)==str:
            intV = int(v)
            if not intV in consonants:
                groups.setdefault(intV,[]).append((v,k))
    # sorted() rather than .items().sort(): on Python 3 items() is a view
    # with no sort method.  (Also avoids overwriting the parameter i.)
    for k,v in sorted(groups.items()):
        if len(v)==1: continue
        v.sort()
        while True:
            print("Group "+str(k))
            es = os.popen("espeak -x","w")
            try: getBuf(es).write(as_utf8('\n').join([markup_inline_word("espeak",w) for _,w in v]))
            finally: es.close() # explicit close, so the pipe is finished before we prompt
            if not int(str(input("Again? 1/0: "))): break

2436

\r

2437

def mainopt_check_for_similar_formats(i):
    # undocumented (won't appear in help text)
    # (so do NOT give this function a docstring)
    # For every pair of formats, counts the integer-keyed entries of the
    # first (other than syllable_separator) that are absent from, or
    # different in, the second; then prints the pairs in increasing order
    # of difference, skipping pairs whose formats have both already been
    # mentioned and stopping once the "names" format is reached.
    items = list(lexFormats.items()) ; r = [] # list(): Python 3's items() view cannot be indexed or sliced
    for idx,(k1,dic1) in enumerate(items):
        for k2,dic2 in items[idx+1:]:
            diff = 0
            for kk,vv in dic1.items():
                if not type(kk)==int: continue
                if kk==syllable_separator: continue
                # explicit membership test instead of a "!"+vv default,
                # which raised TypeError when vv was a byte-string
                if not kk in dic2 or not dic2[kk]==vv: diff += 1
            r.append((diff,k1,k2))
    r.sort() ; had = set()
    for diffs,format1,format2 in r:
        if format1 in had and format2 in had: continue
        had.add(format1) ; had.add(format2)
        if "names" in had: break
        print(str(diffs)+" phoneme differences between "+format1+" and "+format2)

2456

\r

2457

def festival_group_stress(pronunc):
    "Special-case cleanup_func for the Festival format: turns a whitespace-separated phoneme string containing stress digits into Festival's bracketed ((phonemes) stress) groups"
    # TODO: do we ever need to add extra consonants to the
    # previous group instead of the next group? (not sure
    # what difference it makes to the synthesis, but it
    # might make the entry a bit more readable)
    syllables = [] # finished groups, each [phoneme list, stress digit]
    pending = []   # phonemes seen since the last vowel
    for phon in pronunc.split():
        if phon in ('0','1','2'):
            # a stress digit applies to the most recently finished group,
            # and only ever raises that group's stress
            if syllables and phon >= syllables[-1][1]:
                syllables[-1][1] = phon
        else:
            pending.append(phon)
            if phon[:1] in 'aeiou@':
                # a vowel completes the group
                syllables.append([pending,'0'])
                pending = []
    if pending:
        # trailing consonants join the last group if there is one,
        # otherwise they form an unstressed group of their own
        if syllables: syllables[-1][0].extend(pending)
        else: syllables.append([pending,'0'])
    rendered = ["((%s) %s)" % (' '.join(phons),stress) for phons,stress in syllables]
    return "(%s)" % ' '.join(rendered)

2479

\r

2480

def mainopt_convert(i):
    """*<from-format> <to-format>
Convert a user lexicon (generally from its default filename; if this cannot be found then lexconvert will tell you what it should be).
E.g.: python lexconvert.py --convert festival cepstral"""
    # Returns an error string if the request is refused, otherwise None
    # after writing the converted lexicon.
    fromFormat,toFormat = sys.argv[i+1],sys.argv[i+2]
    if fromFormat==toFormat:
        return "Cannot convert a lexicon to its own format (that could result in it being truncated)"
    refusals = {
        "mac-uk": "Cannot permanently save a Mac-UK lexicon; please use the --mac-uk option to read text",
        "example": "Cannot overwrite the built-in example lexicon",
    }
    if toFormat in refusals: return refusals[toFormat]
    for name in (fromFormat,toFormat):
        if name not in lexFormats:
            return "No such format "+repr(name)+" (use --formats to see a list of formats)"
    # Writing needs both lex_filename and lex_entry_format
    fname = None
    try:
        candidate = getSetting(toFormat,"lex_filename")
        getSetting(toFormat,"lex_entry_format") # convert_user_lexicon will need this
        fname = candidate
    except KeyError: pass
    if not fname:
        return "Write support for lexicons of format '%s' not yet implemented (need at least lex_filename and lex_entry_format); try using --phones or --phones2phones options instead" % (toFormat,)
    if not toFormat=="espeak":
        # refuse to overwrite a file that already has something in it
        existing = 0
        try:
            oldFile = open(fname)
            existing = getBuf(oldFile).read()
            del oldFile
        except: pass
        assert not existing, "File "+replHome(fname)+" already exists and is not empty; are you sure you want to overwrite it? (Delete it first if so)" # (if you run with python -O then this is ignored, as are some other checks so be careful)
        outFile = open(fname,"w")
    else:
        assert fname=="en_extra", "If you changed eSpeak's lex_filename in the table you also need to change the code below"
        # keep the commented entries, so can incrementally update the user lexicon only
        moveFailed = os.system("mv en_extra en_extra~ && (grep \" // \" en_extra~ || true) > en_extra")
        if moveFailed:
            sys.stderr.write("Warning: en_extra not found, making a new one\n(espeak compile will probably fail in this directory)\n")
        outFile = open(fname,"a")
    print ("Writing %s lexicon entries to %s file %s" % (fromFormat,toFormat,fname))
    try:
        convert_user_lexicon(fromFormat,toFormat,outFile)
    except Message:
        print (" - error, deleting "+fname)
        os.remove(fname)
        raise

2514

\r

2515

def mainopt_festival_dictionary_to_espeak(i):
    """<location>
Convert the Festival Oxford Advanced Learners Dictionary (OALD) pronunciation lexicon to eSpeak.
You need to specify the location of the OALD file in <location>,
e.g. for Debian festlex-oald package: python lexconvert.py --festival-dictionary-to-espeak /usr/share/festival/dicts/oald/all.scm
or if you can't install the Debian package, try downloading http://ftp.debian.org/debian/pool/non-free/f/festlex-oald/festlex-oald_1.4.0.orig.tar.gz, unpack it into /tmp, and do: python lexconvert.py --festival-dictionary-to-espeak /tmp/festival/lib/dicts/oald/oald-0.4.out
In all cases you need to cd to the eSpeak source directory before running this. en_extra will be overwritten. Converter will also read your ~/.festivalrc if it exists. (You can later incrementally update from ~/.festivalrc using the --convert option; the entries from the system dictionary will not be overwritten in this case.) Specify --without-check to bypass checking the existing eSpeak pronunciation for OALD entries (much faster, but makes a larger file and in some cases compromises the pronunciation quality)."""
    # Returns an error string if the arguments or working directory are
    # unusable, otherwise None after running the conversion.
    try: festival_location=sys.argv[i+1]
    except IndexError: return "Error: --festival-dictionary-to-espeak must be followed by the location of the festival OALD file (see help text)"
    # The two open() calls below only check readability, so close the
    # handles straight away instead of leaking them
    try: open(festival_location).close()
    except Exception: return "Error: The specified OALD location '"+festival_location+"' could not be opened"
    try: open("en_list").close()
    except Exception: return "Error: en_list could not be opened (did you remember to cd to the eSpeak dictsource directory first?)"
    # 2nd argument: check existing eSpeak pronunciations unless --without-check was given
    # 3rd argument: True if ~/.festivalrc exists ("test -e" exits with 0)
    convert_system_festival_dictionary_to_espeak(festival_location,not '--without-check' in sys.argv,not os.system("test -e ~/.festivalrc"))

2529

\r

2530

def mainopt_syllables(i):
    """[<words>]
Attempt to break 'words' into syllables for music lyrics (uses espeak to determine how many syllables are needed)"""
    # As explained on mainopt_ruby's help text, espeak -x output can't be relied on to always put a space between every input word. Rather than try to guess what espeak is going to do, here we simply put a newline after every input word instead. This might affect eSpeak's output (so not recommended for mainopt_ruby), but it should be OK for just counting the syllables. (Also, the assumption that the input words have been taken from song lyrics usefully rules out certain awkward punctuation cases.)
    for txt in getInputText(i+1,"word(s)",'maybe'):
        words = txt.split()
        # one word per line, with "!", ":" and "." removed
        toSpeak = as_utf8('\n').join(as_utf8(w) for w in words)
        for punct in ("!",":","."):
            toSpeak = toSpeak.replace(as_utf8(punct),as_utf8(""))
        response = pipeThroughEspeak(toSpeak)
        isSingleLine = not as_utf8('\n') in response.rstrip()
        if isSingleLine and as_utf8('command') in response:
            return response.strip() # 'bad cmd' / 'cmd not found'
        phonemeLines = [ln for ln in response.split(as_utf8("\n")) if ln]
        hyphenated = []
        for word,line in zip(words,phonemeLines):
            hyphenated.append(hyphenate(word,sylcount(convert(line,"espeak","example"))))
        print (" ".join(hyphenated))
        sys.stdout.flush() # in case piped

2541

\r

2542

def wordSeparator(format):
    """Returns the effective word separator of format: its word_separator setting if it has one, otherwise its phoneme_separator, otherwise a space"""
    fallback = checkSetting(format,"phoneme_separator"," ")
    return checkSetting(format,"word_separator",fallback)

2545

\r

2546

def mainopt_phones2phones(i):
    """*<format1> <format2> [<phonemes in format1>]
Perform a one-off conversion of phonemes from format1 to format2 (format2 can be 'all' if you want)""" # If format1 is 'example' and you don't specify phonemes, we take the words from the example lexicon. But don't say that in the help string because it might confuse the issue about phonemes being optional on the command line and prompted for if not specified and stdin is not piped in all formats other than 'example'.
    # Returns an error string for an unknown format, otherwise None
    # after writing the conversion(s) to standard output.
    format1,format2 = sys.argv[i+1],sys.argv[i+2]
    if not format1 in lexFormats: return "No such format "+repr(format1)+" (use --formats to see a list of formats)"
    if not format2 in lexFormats and not format2=="all": return "No such format "+repr(format2)+" (use --formats to see a list of formats)"
    if format1=="example" and len(sys.argv)<=i+3:
        # No phonemes on the command line: use piped standard input if
        # there is any, otherwise the words of the example lexicon
        if stdin_is_terminal(): txt=""
        else: txt=getBuf(sys.stdin).read() # and it might still be ""
        if txt: clauses=parseIntoWordsAndClauses(format1,txt) # (the result must be kept: 'clauses' is used below)
        else: clauses=[[x[1]] for x in getSetting('example','lex_read_function')()]
    else: clauses = parseIntoWordsAndClauses(format1,getInputText(i+3,"phonemes in "+format1+" format"))
    if format2=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
    else: formats = [format2]
    for format2 in formats:
        if len(formats)>1: writeFormatHeader(format2)
        getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_header")))
        output_clauses(format2,convert(clauses,format1,format2))
        getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_footer"))) ; print("")

2565

\r

2566

def parseIntoWordsAndClauses(format,phones):
    "Splits 'phones' (assumed to be in format 'format') into a list of clauses, each a list of words; empty words and empty clauses are left out"
    def splitArg(sep):
        # what to hand to split(): " " means ANY whitespace (TODO: document this?)
        if sep==" ": return None
        return maybe_bytes(sep,phones)
    wordSep = checkSetting(format,"word_separator") # don't use wordSeparator() here - we're splitting, not joining, so we don't want it to default to phoneme_separator
    clauseSep = checkSetting(format,"clause_separator","\n")
    if clauseSep and type(clauseSep) in [bytes,unicode]:
        rawClauses = phones.split(splitArg(clauseSep))
    else:
        rawClauses = [phones] # no usable string separator: everything is one clause
    result = []
    for clause in rawClauses:
        if wordSep: words = clause.split(splitArg(wordSep))
        else: words = [clause]
        words = [w for w in words if w]
        if words: result.append(words)
    return result

2581

\r

2582

def mainopt_mac_uk(i):
    """<from-format> [<text>]
Speak text in Mac OS 10.7+ British voices while using a lexicon converted in from <from-format>. As these voices do not have user-modifiable lexicons, lexconvert must binary-patch your system's master lexicon; this is at your own risk! (Superuser privileges are needed the first time. A backup of the system file is made, and all changes are restored on normal exit but if you force-quit then you might need to restore the backup manually. Text speaking needs to be under lexconvert's control because it usually has to change the input words to make them fit the available space in the binary lexicon.) By default the Daniel voice is used; Emily or Serena can be selected by setting the MACUK_VOICE environment variable."""
    # If you have xterm etc, then text will also be printed, with words from the altered lexicon underlined.
    assert sys.version_info[0]==2, "--mac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
    fromFormat = sys.argv[i+1]
    if fromFormat not in lexFormats:
        return "No such format "+repr(fromFormat)+" (use --formats to see a list of formats)"
    lex = get_macuk_lexicon(fromFormat)
    try:
        for line in getInputText(i+2,"text",True):
            voice = os.environ.get("MACUK_VOICE","Daniel")
            patched = MacBritish_System_Lexicon(line,voice)
            try:
                patched.readWithLex(lex)
            finally:
                patched.close() # always called, even if reading fails
    except KeyboardInterrupt:
        sys.stderr.write("Interrupted\n")

2597

\r

2598

class Counter(object):
    "Holder for two class-level counters shared by the consonant(), vowel(), other() and variant() functions"
    c = 0  # count: number given to the most recently created phoneme
    sc = 0 # subcount: number of the most recent variant of that phoneme

2601

def other():
    "Used by Phonemes() when creating something that is neither a vowel nor a consonant, e.g. a stress mark. Returns the next phoneme number and restarts the variant numbering."
    Counter.sc = 0
    Counter.c += 1
    return Counter.c

2604

# Phoneme numbers handed out by consonant() and vowel() respectively
consonants = set()
mainVowels = set()
def consonant():
    "Used by Phonemes() when creating a consonant: takes the next phoneme number and records it in 'consonants'"
    number = other()
    consonants.add(number)
    return number

2608

def vowel():
    "Used by Phonemes() when creating a vowel: takes the next phoneme number and records it in 'mainVowels'"
    number = other()
    mainVowels.add(number)
    return number

2611

def opt_vowel():
    "Used by Phonemes() when creating an optional vowel (one that has no warning issued if some format doesn't support it)"
    # unlike vowel(), the number is not recorded in mainVowels
    number = other()
    return number

2614

def variant():
    "Used by Phonemes() when creating a variant of the just-defined vowel/consonant/etc. Returns (0, n) where n is a float whose integer part is the main phoneme's number."
    subcount = Counter.sc + 1
    # skip subcounts ending in 0: as a decimal fraction, .10 is the same as .1
    while str(subcount).endswith('0'): subcount += 1
    Counter.sc = subcount
    return 0, float('%d.%d' % (Counter.c,subcount))
    # the 0 is so we can say _, name = variant()
    # so as to get some extra indentation

2621

\r

2622

def ifset(var,a,b=""):
    "Returns a if the environment variable var is set to something non-empty, otherwise returns b. Used in LexFormats to create tables with variations set by the environment."
    import os
    value = os.environ.get(var,"")
    if not value: return b
    return a

2627

\r

2628

def speakjet(symbol,opcode):
    "Special-case function for the Speakjet table: returns chr(opcode) if SPEAKJET_BINARY is set, the symbol if SPEAKJET_SYM is set, otherwise the opcode as a decimal string"
    assert type(opcode)==int
    if not ifset('SPEAKJET_BINARY',1):
        return ifset('SPEAKJET_SYM',symbol,str(opcode))
    assert not ifset('SPEAKJET_SYM',1), "Cannot set both SPEAKJET_SYM and SPEAKJET_BINARY"
    return chr(opcode)

2635

\r

2636

def makeDic(doc,*args,**kwargs):
    "Make a dictionary with a doc string, default-bidirectional mappings and extra settings; see LexFormats for how this is used."
    # doc: description string, stored under the key ("settings","doc").
    # args: tuples (k,v) or (k,v,bidir) in which one of k and v is a string
    #   and the other a phoneme number (int, or float for a variant).
    # kwargs: settings; each is stored under the key ('settings',name).
    # Returns the new dictionary, and also records it in lastDictionaryMade.
    assert type(doc)==str, "doc must be a string"
    d = {} ; duplicates = set()
    for a in args:
        assert type(a)==tuple and (len(a)==2 or len(a)==3)
        k=a[0]
        if k in d: duplicates.add(k)
        v=a[1]
        assert (type(k) in [bytes,unicode] and type(v) in [int,float]) or (type(v) in [bytes,unicode] and type(k) in [int,float]), "Wrong types "+repr(a)+" (did you forget a _, before calling variant() or something?)"
        d[k] = v
        if type(k)==unicode: d[as_utf8(k)] = v # a Unicode key can also be looked up by its UTF-8 form
        if len(a)==3: bidir=a[2]
        else: bidir=True # mappings are bidirectional unless a third element says otherwise
        if bidir:
            # (k,v,True) = both (k,v) and (v,k)
            if v in d: duplicates.add(v)
            d[v] = k
    assert not duplicates, " Duplicate key(s) in "+repr(doc)+": "+", ".join((repr(dup)+"".join(" (="+g+")" for g,val in globals().items() if val==dup)) for dup in sorted(list(duplicates)))+". Did you forget a ,False to suppress bidirectional mapping?" # by the way, Python does not detect duplicate keys in {...} notation - it just lets you overwrite
    # Numbers handed out by consonant() and vowel() that this table has no entry for
    missing = [l for l in (list(consonants)+list(mainVowels)) if not l in d]
    # did_approx = False
    # NB only the PRESENCE of approximate_missing in kwargs is tested here, not its value
    if missing and 'approximate_missing' in kwargs:
        for miss,approxTo in [
                # TODO: put this table somewhere else?
                # (If the thing on the right is just 1 item, we could make the thing on the left a variant of it. But that might not be a good idea unless they're really very close, since if it's a variant then the substitution is done without warning even if approximate_missing is not set.)
                (a_as_in_ago, [u_as_in_but]),
                (a_as_in_air, [e_as_in_them,r]),
                (ear, [e_as_in_eat,u_as_in_but]),
                (oor_as_in_poor, [close_to_or]), # TODO: ,r?
                (a_as_in_ah,[a_as_in_apple]), # this seems to be missing in some American voices (DecTalk, Keynote, SAM); TODO: is this the best approximation we can do?
                (a_as_in_apple,[a_as_in_ah]), # the reverse of the above, for Devanagari
                (o_as_in_orange,[oo_as_in_food]),(o_as_in_go,[oo_as_in_food]),(oy_as_in_toy,[oo_as_in_food,i_as_in_it]),(o_as_in_now,[a_as_in_ah, w]),(e_as_in_herd,[u_as_in_but,u_as_in_but]),(ar_as_in_year,[u_as_in_but,u_as_in_but]),(eye,[a_as_in_ah,y]),(th_as_in_think,[th_as_in_them]), # (Devanagari: is this really the best we can do?)
                ]:
            # Only approximate a phoneme that really is missing, and only
            # if every part of its approximation is available in this table
            if miss in missing and all(x in d for x in approxTo):
                # the parts are joined with the phoneme_separator setting (default " ")
                d[miss]=maybe_bytes(kwargs.get("phoneme_separator"," "),d[approxTo[0]]).join(d[x] for x in approxTo)
                # did_approx = True
                missing.remove(miss)
    # if did_approx: doc="(approx.) "+doc # and see also the code in makeVariantDic. Commenting out because this is misleading: the formats where we didn't do a did_approx might also contain approximations of some kind. Incidentally there are some British English voices that need approximate_missing (e.g. Apollo 2)
    d[("settings","doc")] = doc
    if missing:
        # Warn, naming each missing phoneme by the global variable(s) holding its number
        import sys ; sys.stderr.write("WARNING: Some non-optional vowels/consonants are missing from "+repr(doc)+"\nThe following are missing: "+", ".join("/".join(g for g,val in globals().items() if val==m) for m in missing)+"\n")
    for k,v in kwargs.items(): d[('settings',k)] = v
    assert type(d.get(('settings','cleanup_regexps'),[]))==list, "cleanup_regexps must be a list" # not one tuple
    assert type(d.get(('settings','cvtOut_regexps'),[]))==list, "cvtOut_regexps must be a list" # not one tuple
    # Neither separator may also be one of the table's keys
    wsep = d.get(('settings','word_separator'),None)
    psep = d.get(('settings','phoneme_separator'),' ')
    if not wsep==None: assert not wsep in d, "word_separator duplicates with a key in "+repr(doc)
    if not psep==None: assert not psep in d, "phoneme_separator duplicates with a key (did you forget to change the default, or to add a ,False somewhere?) in "+repr(doc)
    global lastDictionaryMade ; lastDictionaryMade = d # read by makeVariantDic
    return d

2686

def makeVariantDic(doc,*args,**kwargs):
    "Like makeDic but create a new 'variant' version of the last-made dictionary, modifying some phonemes and settings (and giving it a new doc string) but keeping everything else the same. Any list settings (e.g. cleanup_regexps) are ADDED to by the variant; other settings and phonemes are REPLACED if they are specified in the variant. If you don't want subsequent variants to inherit the changes made by this variant, add noInherit=True to the keyword args."
    # Arguments are as for makeDic, plus the optional noInherit keyword
    # (which is not passed on to makeDic).  Returns the merged dictionary.
    global lastDictionaryMade,mainVowels,consonants
    ldmOld = lastDictionaryMade
    toUpdate = lastDictionaryMade.copy()
    noInherit = kwargs.pop('noInherit',False)
    oldV,oldC = mainVowels,consonants
    mainVowels,consonants = [],[] # so makeDic doesn't complain if some vowels/consonants are missing
    try: d = makeDic(doc,*args,**kwargs)
    finally: mainVowels,consonants = oldV,oldC # restore even if makeDic's checks fail
    # if toUpdate[("settings","doc")].startswith("(approx.) ") and not d[("settings","doc")].startswith("(approx.) "): d[("settings","doc")]="(approx.) "+d[("settings","doc")] # TODO: always?
    for k,v in toUpdate.items():
        # list values are extended by the variant rather than replaced
        if type(v)==list and k in d: d[k] = v+d[k]
    toUpdate.update(d)
    # makeDic has just pointed lastDictionaryMade at d, which holds ONLY
    # this variant's own entries.  The next variant must start either from
    # the complete merged table or (noInherit) from the table we started
    # from, otherwise it would lose everything this variant did not override.
    if noInherit: lastDictionaryMade = ldmOld
    else: lastDictionaryMade = toUpdate
    return toUpdate

2705

def getSetting(formatName,settingName):
    "Looks up a setting of a format in lexFormats; raises KeyError if the format or the setting is not there"
    formatTable = lexFormats[formatName]
    return formatTable[('settings',settingName)]

2708

def checkSetting(formatName,settingName,default=""):
    "Looks up a setting of a format in lexFormats, returning 'default' if the format does not have that setting"
    formatTable = lexFormats[formatName]
    return formatTable.get(('settings',settingName),default)

2711

\r

2712

import sys,re,os
# Python 2/3 compatibility: the rest of the file uses the names bytes,
# unicode, unichr, xrange, getoutput and a sorted() that accepts a
# comparison function, whichever version of Python is running.
try: from subprocess import getoutput
except: from commands import getoutput # Python 2
try: bytes # Python 3 and newer Python 2
except: bytes = str # older Python 2
try: unicode # Python 2
except: # Python 3
    unicode,unichr,xrange = str,chr,range # (unichr is bound to the built-in chr here, before chr is redefined below)
    def chr(x): return bytes([x]) # chr() now gives a 1-byte byte-string
    _builtin_sorted = sorted
    from functools import cmp_to_key
    def sorted(l,theCmp=None):
        # accepts an old-style comparison function as the 2nd argument
        if theCmp:
            return _builtin_sorted(l,key=cmp_to_key(theCmp))
        else: return _builtin_sorted(l)
    # NOTE(review): indentation was reconstructed; this check is assumed to belong to the Python 3 branch because its message is about Python 3.4 - confirm
    assert sys.version_info[1] > 4, "lexconvert cannot run on Python 3.4 due to lack of byte-string percent formatting (PEP 461). Please use Python 3.5+ or stick with Python 2."

2728

def getBuf(f):
    "Returns f.buffer if f has one (Python 3 text streams), otherwise f itself (Python 2), so that bytes may be written or read on either version"
    return getattr(f,'buffer',f)

2732

\r

2733

# Result of the most recent make_dictionary call, and the names it was for
cached_sourceName,cached_destName,cached_dict = None,None,None
def make_dictionary(sourceName,destName):
    "Uses lexFormats to make a mapping dictionary from a particular source format to a particular dest format, and also sets module variables for that particular conversion (TODO: put those module vars into an object in case someone wants to use this code in a multithreaded server)"
    # Returns a dictionary mapping each source-format string to the
    # dest-format string for the same phoneme.  Also sets the globals
    # dest_consonants, dest_syllable_sep and implicit_vowel_before_NL.
    # Only the most recent (source,dest) pair is cached.
    global cached_sourceName,cached_destName,cached_dict
    if (sourceName,destName) == (cached_sourceName,cached_destName): return cached_dict
    source = lexFormats[sourceName]
    dest = lexFormats[destName]
    d = {}
    global dest_consonants ; dest_consonants = set()
    global dest_syllable_sep ; dest_syllable_sep = dest.get(syllable_separator,"")
    global implicit_vowel_before_NL
    implicit_vowel_before_NL = None
    for k,v in source.items():
        if type(k)==tuple: continue # settings
        if type(v) in [bytes,unicode]: continue # (num->string entries are for converting IN to source; we want the string->num entries for converting out)
        if not v in dest: v = int(v) # (try the main version of a variant)
        if not v in dest: continue # (haven't got it - will have to ignore or break into parts)
        assert type(k) in [bytes,unicode]
        d[k] = dest[v]
        if int(v) in consonants: dest_consonants.add(d[k])
        # Remember dest's string for e_as_in_herd; an entry for the main
        # phoneme (v==int(v)) replaces one that was taken from a variant
        if int(v)==e_as_in_herd and (not implicit_vowel_before_NL or v==int(v)): # TODO: or u_as_in_but ? used by festival and some other synths before words ending 'n' or 'l' (see usage of implicit_vowel_before_NL later)
            implicit_vowel_before_NL = d[k]
        # Make the entry findable by both the UTF-8 and the Unicode form of its key
        d[as_utf8(k)] = d[k]
        try: d[as_unicode(k)] = d[k]
        except UnicodeDecodeError: pass
    try:
        if any(type(v)==unicode for v in d.values()): d,dest_consonants=dict((k,as_unicode(v)) for k,v in d.items()),set(as_unicode(v) for v in dest_consonants) # Python 2: if ANY dest are Unicode, make them ALL Unicode
    except UnicodeDecodeError: d,dest_consonants=dict((k,as_utf8(v)) for k,v in d.items()),set(as_utf8(v) for v in dest_consonants) # ... or make them ALL byte-strings if some were binary and not readable as UTF-8
    cached_sourceName,cached_destName,cached_dict=sourceName,destName,d
    return d

2763

\r

2764

# (character, context) pairs that convert() has already warned about
warnedAlready = set()
def convert(pronunc,source,dest):
    "Convert pronunc from source to dest. pronunc can be a string or a list; if a list then we'll recurse on each of the list elements and return a new list (this is meant for batch-converting clauses etc)"
    # source and dest are format names (keys of lexFormats).  The result
    # is a string (or a list of the same shape as the input list), after
    # dest's cleanup_regexps and cleanup_func have been applied.
    # Input that has no mapping is dropped one character at a time, with
    # a warning on standard error unless the source format's
    # safe_to_drop_characters setting covers it.
    assert type(pronunc) in [bytes,unicode,list], type(pronunc)
    if source==dest: return pronunc # essential for --try experimentation with codes not yet supported by lexconvert
    if type(pronunc)==list: return [convert(p,source,dest) for p in pronunc]
    # Source-side preprocessing
    func = checkSetting(source,'cvtOut_func')
    if func: pronunc=func(pronunc)
    for s,r in checkSetting(source,'cvtOut_regexps'):
        pronunc=re.sub(maybe_bytes(s,pronunc),maybe_bytes(r,pronunc),pronunc)
    ret = [] ; toAddAfter = None # toAddAfter: a stress mark waiting to be placed after the next vowel
    dictionary = make_dictionary(source,dest)
    maxLen=max(len(l) for l in dictionary.keys())
    debugInfo=""
    separator = checkSetting(dest,'phoneme_separator',' ')
    safe_to_drop = checkSetting(source,"safe_to_drop_characters")
    while pronunc:
        # Longest match first; a length of 0 means nothing matched at all
        for lettersToTry in range(maxLen,-1,-1):
            if not lettersToTry:
                if safe_to_drop==True: pass
                elif (not safe_to_drop) or not pronunc[:1] in maybe_bytes(safe_to_drop,pronunc) and not (pronunc[:1],debugInfo) in warnedAlready:
                    warnedAlready.add((pronunc[:1],debugInfo))
                    sys.stderr.write("Warning: ignoring "+source+" character "+repr(pronunc[:1])+debugInfo+" (unsupported in "+dest+")\n")
                pronunc=pronunc[1:] # ignore
            elif pronunc[:lettersToTry] in dictionary:
                debugInfo=" after "+as_printable(pronunc[:lettersToTry])
                toAdd=dictionary[pronunc[:lettersToTry]]
                assert type(toAdd) in [bytes,unicode], type(toAdd)
                isStressMark=(toAdd and toAdd in [maybe_bytes(lexFormats[dest].get(primary_stress,''),toAdd),maybe_bytes(lexFormats[dest].get(secondary_stress,''),toAdd)])
                if toAdd==maybe_bytes(lexFormats[dest].get(syllable_separator,''),toAdd): pass
                elif isStressMark and not checkSetting(dest,"stress_comes_before_vowel"):
                    if checkSetting(source,"stress_comes_before_vowel"): toAdd, toAddAfter = maybe_bytes("",toAdd),toAdd # move stress marks from before vowel to after
                    else: # stress is already after, but:
                        # With Cepstral synth (and kana-approx), stress mark should be placed EXACTLY after the vowel and not any later. Might as well do this for others also.
                        r=len(ret)-1
                        while ret[r] in dest_consonants or ret[r].endswith(maybe_bytes("*added",ret[r])): r -= 1 # (if that raises IndexError then the input had a stress mark before any vowel) ("*added" condition is there so that implicit vowels don't get the stress)
                        ret.insert(r+1,toAdd) ; toAdd=maybe_bytes("",toAdd)
                elif isStressMark and not checkSetting(source,"stress_comes_before_vowel"): # it's a stress mark that should be moved from after the vowel to before it
                    i=len(ret)
                    while i and (ret[i-1] in dest_consonants or ret[i-1].endswith(maybe_bytes("*added",ret[i-1]))): i -= 1
                    if i: i-=1
                    ret.insert(i,toAdd)
                    if dest_syllable_sep: ret.append(maybe_bytes(dest_syllable_sep,toAdd)) # (TODO: this assumes stress marks are at end of syllable rather than immediately after vowel; correct for Festival; check others; probably a harmless assumption though; mac-uk is better with syllable separators although espeak basically ignores them)
                    toAdd = maybe_bytes("",toAdd)
                # attempt to sort out the festival dictionary's (and other's) implicit_vowel_before_NL
                elif implicit_vowel_before_NL and ret and ret[-1] and toAdd in [maybe_bytes('n',toAdd),maybe_bytes('l',toAdd)] and ret[-1] in dest_consonants: ret.append(maybe_bytes(implicit_vowel_before_NL,toAdd)+maybe_bytes('*added',toAdd))
                # (syllable separator compared via maybe_bytes, like the comparison at the top of this chain, so it also works when toAdd is a byte-string)
                elif len(ret)>2 and ret[-2].endswith(maybe_bytes('*added',ret[-2])) and toAdd and not toAdd in dest_consonants and not toAdd==maybe_bytes(dest_syllable_sep,toAdd): del ret[-2]
                if toAdd:
                    # Add it, but if toAdd is multiple phonemes, try to put toAddAfter after the FIRST phoneme
                    if separator: toAddList=toAdd.split(maybe_bytes(separator,toAdd)) # (maybe_bytes: a byte-string cannot be split on a str separator in Python 3)
                    else: toAddList = [toAdd] # TODO: won't work for formats that don't have a phoneme separator (doesn't really matter for eSpeak though)
                    ret.append(toAddList[0])
                    if toAddAfter and not toAddList[0] in dest_consonants:
                        ret.append(toAddAfter)
                        toAddAfter=None
                    ret += toAddList[1:]
                pronunc=pronunc[lettersToTry:]
                break
    if toAddAfter: ret.append(toAddAfter)
    if ret and ret[-1]==maybe_bytes(dest_syllable_sep,ret[-1]): del ret[-1] # spurious syllable separator at end
    if not ret: ret = ""
    else: ret=maybe_bytes(separator,ret[0]).join(ret).replace(maybe_bytes('*added',ret[0]),maybe_bytes('',ret[0]))
    # Dest-side postprocessing
    for s,r in checkSetting(dest,'cleanup_regexps'):
        ret=re.sub(maybe_bytes(s,ret),maybe_bytes(r,ret),ret)
    func = checkSetting(dest,'cleanup_func')
    if func: return func(ret)
    else: return ret

2831

\r

2832

def unicode_preprocess(pronunc):
    "Special-case cvtOut_func for unicode-ipa etc: evaluates \\uNNNN-style escapes if the input contains them (and no double quote), otherwise tries to decode the input as UTF-8; the input is returned unchanged if that fails"
    looksEscaped = maybe_bytes("\\u",pronunc) in pronunc and maybe_bytes('"',pronunc) not in pronunc
    if looksEscaped:
        # maybe \uNNNN copied from Gecko on X11, can just evaluate it to get the unicode
        # (NB make sure to quote the \'s if passing in on the command line)
        try: return eval('u"'+pronunc+'"')
        except: return pronunc
    # otherwise see if it makes sense as utf-8
    try: return pronunc.decode('utf-8')
    except: return pronunc

\r

def ascii_braille_to_unicode(a):
    "Special-case cleanup_func for braille-ipa (set by braille-ipa if BRAILLE_UNICODE is set). Converts Braille ASCII to Unicode dot patterns."
    # The n-th character of this string corresponds to the code point 0x2800+n
    asciiCells = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
    toDots = {}
    for ch,codepoint in zip(asciiCells,range(0x2800,0x2840)):
        toDots[ch] = unichr(codepoint)
    # characters that are not in the table are passed through unchanged
    return u''.join(toDots.get(c,c) for c in list(a))

2847

def unicode_to_ascii_braille(u):
    """Convert Unicode Braille dot patterns (0x2800-0x283F) in u to Braille ASCII.

    u is first passed through as_unicode.  Characters outside the
    Braille range are kept as they are.  If the result starts with ",7"
    and ends with "7'", that wrapper is stripped.
    """
    braille_ascii = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
    dot_patterns = [unichr(codepoint) for codepoint in range(0x2800,0x2840)]
    table = dict(zip(dot_patterns,braille_ascii))
    pieces = []
    for ch in as_unicode(u):
        pieces.append(table.get(ch,ch))
    result = ''.join(pieces)
    if result.startswith(",7") and result.endswith("7'"):
        return result[2:-2]
    return result

2852

\r

2853

def hiragana_to_katakana(u):
    """Special-case cleanup_func for kana-approx.

    Converts all hiragana characters (0x3041-0x3096) in unicode string
    'u' into katakana by adding 0x60 to their code points, but only if
    the environment variable KANA_TYPE is set to anything beginning
    with a 'k' (case-insensitive); otherwise returns u unchanged.
    """
    assert type(u)==unicode
    kana_type = os.environ.get("KANA_TYPE","").lower()
    if not kana_type.startswith("k"):
        return u
    def shift(ch):
        # move a hiragana code point into the katakana block
        if 0x3041 <= ord(ch) <= 0x3096: return unichr(ord(ch)+0x60)
        return ch
    return u"".join([shift(ch) for ch in u])

2862

\r

2863

def espeak_probably_right_already(existing_pronunc,new_pronunc):
    """Used by convert_system_festival_dictionary_to_espeak to compare a "new" pronunciation with eSpeak's existing pronunciation.

    As the transcription from OALD to eSpeak is only approximate, it
    could be that our new pronunciation is not identical to the existing
    one but the existing one is actually correct; try to detect when
    this happens by checking if the pronunciations are the same after
    some simplifications.

    Returns True if the two are identical or become identical after
    simplification, otherwise False.
    """
    if existing_pronunc==new_pronunc: return True
    # Applied strictly in this order (e.g. "I2" must be handled before "I")
    substitutions = [
        (";",""), ("%",""),
        ("a2","@"),
        ("3","@"),
        ("L","l"),
        ("I2","i:"),
        ("I","i:"), ("i@","i:@"),
        (",",""),
        ("s","z"),
        ("aa","A:"),
        ("A@","A:"),
        ("O@","O:"),
        ("o@","O:"),
        ("r-","r"),
    ]
    def simplify(pronunc):
        # maybe_bytes keeps each pattern the same type (bytes/text) as pronunc
        for old,new in substitutions:
            pronunc = pronunc.replace(maybe_bytes(old,pronunc),maybe_bytes(new,pronunc))
        return pronunc
    # TODO: rewrite @ to 3 whenever not followed by a vowel?
    # If almost the same, report a match: festival @/a2 etc seems to be a bit
    # ambiguous so the caller should leave the existing pronunciation alone.
    return as_printable(simplify(existing_pronunc))==as_printable(simplify(new_pronunc))

2882

\r

2883

def parse_festival_dict(festival_location):
    """For OALD; yields word,part-of-speech,pronunciation.

    Reads the Festival dictionary file at festival_location line by
    line.  Anything from "((pos" onwards is discarded, quotes and
    brackets are removed, and the rest is split into word, part of
    speech and pronunciation.  Lines that do not split into three
    fields, or whose second field is not a recognised part of speech
    (which happens for multi-word entries), are skipped.  The word is
    yielded in lower case.  The file is closed when the generator is
    exhausted or closed.
    """
    known_pos = ['n','v','a','cc','dt','in','j','k','nil','prp','uh']
    dict_file = open(festival_location)
    try:
        for line in dict_file:
            line=line.strip()
            if "((pos" in line: line=line[:line.index("((pos")]
            if line.startswith('( "'): line=line[3:]
            line=line.replace('"','').replace('(','').replace(')','')
            try:
                word, pos, pronunc = line.split(None,2)
            except ValueError: continue # malformed line
            if pos not in known_pos: continue # two or more words
            yield (word.lower(), pos, pronunc)
    finally:
        dict_file.close()

2896

\r

2897

class Message(Exception):
    """Exception carrying a message for the user (e.g. raised by read_user_lexicon when a lexicon cannot be read)."""

2898

def convert_system_festival_dictionary_to_espeak(festival_location,check_existing_pronunciation,add_user_dictionary_also):
    """See mainopt_festival_dictionary_to_espeak.

    Works in the current directory, which must contain eSpeak's en_list
    (en_extra is moved aside and rewritten).  Steps:

    1. Read en_list so that nothing already listed there is overridden.
    2. Read the Festival dictionary at festival_location, dropping words
       that have more than one distinct (pronunciation, part-of-speech).
    3. If check_existing_pronunciation, ask espeak (via a shell pipe and
       /tmp/.pronunc) how it currently pronounces each remaining word.
    4. Convert each pronunciation to eSpeak format and write it to
       en_extra unless espeak_probably_right_already says that eSpeak's
       existing pronunciation is fine.
    5. If add_user_dictionary_also, append the user's Festival lexicon.
    6. Recompile, then (only when existing pronunciations were recorded)
       re-check the words that were NOT output and add entries to undo
       any affix side-effects of the new entries.

    Returns the list of words not output because eSpeak was judged to be
    right already.
    """
    os.system("mv en_extra en_extra~") # start with blank 'extra' dictionary
    if check_existing_pronunciation: os.system("espeak --compile=en") # so that the pronunciation we're checking against is not influenced by a previous version of en_extra
    outFile=open("en_extra","w")
    print ("Reading dictionary lists")
    wordDic = {} ; ambiguous = {}
    el = open("en_list")
    try: en_list_lines = getBuf(el).read().split(as_utf8('\n'))
    finally: el.close()
    for line in filter(lambda x:x.split() and not re.match(maybe_bytes(r'^[a-z]* *\$',x),x),en_list_lines): ambiguous[line.split()[0]]=ambiguous[line.split()[0]+as_utf8('s')]=True # this stops the code below from overriding anything already in espeak's en_list. If taking out then you need to think carefully about words like "a", "the" etc.
    for word,pos,pronunc in parse_festival_dict(festival_location):
        pronunc=pronunc.replace("i@ 0 @ 0","ii ou 2 ").replace("i@ 0 u 0","ii ou ") # (hack for OALD's "radio"/"video"/"stereo"/"embryo" etc)
        pronunc=pronunc.replace("0","") # 0's not necessary, and OALD sometimes puts them in wrong places, confusing the converter
        if word in ['mosquitoes']: continue # OALD bug (TODO: any others?)
        if word in wordDic and not wordDic[word]==(pronunc,pos):
            ambiguous[as_utf8(word)] = True
            del wordDic[word] # better not go there
        if not as_utf8(word) in ambiguous:
            wordDic[word] = (pronunc, pos)
    toDel = []
    oldPronDic = {} # stays empty unless check_existing_pronunciation
    if check_existing_pronunciation:
        print ("Checking existing pronunciation")
        proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
        wList = []
    progressCount=0 ; oldPercent=-1
    itemList = list(wordDic.items())
    # Make sure it's NOT left in sorted order, to ensure eSpeak doesn't
    # cache pronunciation of previous word when add suffix
    # (which can subtly change eSpeak's pronunciation in
    # some versions of eSpeak, leading to
    # Python 2/3 differences as Python 3 sorts by default) :
    # sort for determinism, then interleave the two halves back-to-front.
    itemList.sort()
    i0,i1 = itemList[:int(len(itemList)/2)],itemList[int(len(itemList)/2):]
    itemList = []
    while i0 or i1:
        if i0: itemList.append(i0.pop())
        if i1: itemList.append(i1.pop())
    for word,(pronunc,pos) in itemList:
        if check_existing_pronunciation:
            percent = int(progressCount*100/len(wordDic))
            if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
            oldPercent=percent
            progressCount += 1
        if not re.match("^[A-Za-z]*$",word): # (some versions of eSpeak also OK with "-", but not all)
            # contains special characters - better not go there
            toDel.append(word)
        elif word.startswith("plaque") or word in "friday saturday sunday tuesday thursday yesterday".split():
            # hack to accept eSpeak's pl'ak instead of pl'A:k - order was reversed in the March 2009 draft
            toDel.append(word)
        elif word[-1]=="s" and word[:-1] in wordDic:
            # unnecessary plural (espeak will pick up on them anyway)
            toDel.append(word)
        elif word.startswith("year") or "quarter" in word: toDel.append(word) # don't like festival's pronunciation of those (TODO: also 'memorial' why start with [m'I])
        elif check_existing_pronunciation:
            getBuf(proc).write(as_utf8(word)+as_utf8("\n"))
            proc.flush() # so the progress indicator works
            wList.append(word)
    if check_existing_pronunciation:
        proc.close() ; print("")
        tp = open("/tmp/.pronunc")
        try: espeak_lines = getBuf(tp).read().split(as_utf8("\n"))
        finally: tp.close()
        for k,v in zip(wList,espeak_lines): oldPronDic[k]=v.strip().replace(as_utf8(" "),as_utf8(""))
    for w in toDel: del wordDic[w]
    print ("Doing the conversion")
    lines_output = 0
    total_lines = 0
    not_output_because_ok = []
    items = list(wordDic.items()) ; items.sort() # necessary because of the hacks below which check for the presence of truncated versions of the word (want to have decided whether or not to output those truncated versions before reaching the hacks)
    for word,(pronunc,pos) in items:
        total_lines += 1
        new_e_pronunc = convert(pronunc,"festival","espeak")
        if new_e_pronunc.count("'")==2 and not '-' in word: new_e_pronunc=new_e_pronunc.replace("'",",",1) # if 2 primary accents then make the first one a secondary (except on hyphenated words)
        # TODO if not en-rp? - if (word.endswith("y") or word.endswith("ie")) and new_e_pronunc.endswith("i:"): new_e_pronunc=new_e_pronunc[:-2]+"I"
        unrelated_word = None
        if check_existing_pronunciation: espeakPronunc = oldPronDic.get(word,"")
        else: espeakPronunc = ""
        if word[-1]=='e' and word[:-1] in wordDic: unrelated_word, espeakPronunc = word[:-1],"" # hack: if word ends with 'e' and dropping the 'e' leaves a valid word that's also in the dictionary, we DON'T want to drop this word on the grounds that espeak already gets it right, because if we do then adding 's' to this word may cause espeak to add 's' to the OTHER word ('-es' rule).
        if espeak_probably_right_already(espeakPronunc,new_e_pronunc):
            not_output_because_ok.append(word)
            continue
        if not unrelated_word: lines_output += 1
        getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+as_utf8(new_e_pronunc)+as_utf8(" // from Festival's (")+as_utf8(pronunc)+as_utf8(")"))
        if espeakPronunc: getBuf(outFile).write(as_utf8(", not [[")+as_utf8(espeakPronunc)+as_utf8("]]"))
        elif unrelated_word: getBuf(outFile).write(as_utf8(" (here to stop espeak's affix rules getting confused by Festival's \"")+as_utf8(unrelated_word)+as_utf8("\")"))
        getBuf(outFile).write(as_utf8("\n"))
    print ("Corrected(?) %d entries out of %d" % (lines_output,total_lines))
    if add_user_dictionary_also: convert_user_lexicon("festival","espeak",outFile)
    outFile.close()
    os.system("espeak --compile=en")
    # The side-effect check compares against the pronunciations recorded in
    # oldPronDic, so it can only run if those were collected above.
    if not_output_because_ok and check_existing_pronunciation:
        print ("Checking for unwanted side-effects of those corrections") # e.g. terrible as Terr + ible, inducing as in+Duce+ing
        proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
        progressCount = 0 ; oldPercent = -1
        for w in not_output_because_ok:
            getBuf(proc).write(as_utf8(w)+as_utf8("\n")) ; proc.flush()
            percent = int(progressCount*100/len(not_output_because_ok))
            if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
            oldPercent = percent
            progressCount += 1
        proc.close()
        tp = open("/tmp/.pronunc")
        try: espeak_lines = getBuf(tp).read().split(as_utf8("\n"))
        finally: tp.close()
        outFile=open("en_extra","a") # append to it
        try:
            for word,pronunc in zip(not_output_because_ok,espeak_lines):
                pronunc = pronunc.strip().replace(as_utf8(" "),as_utf8(""))
                previous = oldPronDic.get(word)
                if previous is None: continue # no recorded pronunciation to restore
                if not pronunc==previous and not espeak_probably_right_already(previous,pronunc):
                    getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+previous+as_utf8(" // (undo affix-side-effect from previous words that gave \"")+pronunc+as_utf8("\")\n"))
        finally:
            outFile.close()
        os.system("espeak --compile=en")
    return not_output_because_ok

3006

\r

3007

def read_user_lexicon(fromFormat):
    """Calls the appropriate lex_read_function, opening lex_filename first if supplied.

    Raises Message if the format has no lex_read_function, or if its
    lexicon file cannot be opened.  The read function is called with
    the open file, or with None if the format has no lex_filename
    setting or that setting is None.
    """
    readFunction = checkSetting(fromFormat,"lex_read_function")
    if not readFunction:
        raise Message("Reading from '%s' lexicon file not yet implemented (no lex_read_function); try using --phones or --phones2phones options instead" % (fromFormat,))
    lexfile = None # the default covers e.g. the example lexicon
    try:
        lexFilename = getSetting(fromFormat,"lex_filename")
        if lexFilename is not None:
            lexfile = open(lexFilename)
            # TODO: document LEXCONVERT_OMIT_READING_FROM (might be useful for the --mac-uk option)
            quiet = os.environ.get("LEXCONVERT_OMIT_READING_FROM","")
            if not quiet: print ("Reading from "+lexFilename)
    except KeyError:
        # lex_read_function without lex_filename is allowed, if the read
        # function can take null param and fetch the lexicon itself
        lexfile = None
    except IOError:
        raise Message(fromFormat+"'s lexicon is expected to be in a file called "+replHome(lexFilename)+" which could not be read - please fix and try again")
    return readFunction(lexfile)

3020

\r

3021

def replHome(fname):
    """Format fname for printing, substituting ~ for HOME if appropriate.

    The substitution is made only when the HOME environment variable is
    non-empty and fname starts with HOME followed by the path separator.
    """
    home = os.environ.get('HOME','')
    if not home: return fname
    if not fname.startswith(home+os.sep): return fname
    return "~"+fname[len(home):]

3027

\r

3028

def get_macuk_lexicon(fromFormat):
    """Converts lexicon from fromFormat and returns a list suitable for MacBritish_System_Lexicon's readWithLex.

    The list holds one (word, pronunciation) pair per entry of the user
    lexicon, with the pronunciation converted to the "mac-uk" format.
    """
    converted = []
    for word, pronunc in read_user_lexicon(fromFormat):
        converted.append((word,convert(pronunc,fromFormat,"mac-uk")))
    return converted

3031

\r

3032

def as_utf8(s):
    "Return s encoded as UTF-8 if it is exactly of type unicode; anything else is returned unchanged"
    if type(s)!=unicode:
        return s
    return s.encode('utf-8')

3035

def as_unicode(s):
    "Return s unchanged if it is exactly of type unicode; otherwise return it decoded from UTF-8"
    if type(s)!=unicode:
        return s.decode('utf-8')
    return s

3038

def maybe_bytes(s,i):
    """Python 2/3 compatibility: convert s to bytes if i is bytes.

    More precisely: s is returned as-is when i is exactly of type
    unicode, and is passed through as_utf8 otherwise.
    """
    if type(i)!=unicode:
        return as_utf8(s)
    return s

3042

def as_printable(s):
    """Return s in the interpreter's native string form.

    On Python 2 this is the as_utf8 (byte-string) form; on Python 3 and
    above it is that form decoded back from UTF-8.
    """
    encoded = as_utf8(s)
    if sys.version_info[0] >= 3:
        return encoded.decode('utf-8')
    return encoded

3045

\r

3046

def convert_user_lexicon(fromFormat,toFormat,outFile):
    """See mainopt_convert.

    Reads the user lexicon in fromFormat and writes it to outFile in
    toFormat: first toFormat's lex_header, then one lex_entry_format
    line per (word, pronunciation) with the pronunciation converted and
    the word's case adjusted according to lex_word_case, and finally
    lex_footer.  A header or footer that is a string is written as
    UTF-8; anything else is called with outFile.
    """
    def emit(setting):
        # write a string setting verbatim, or let a callable setting do its own output
        if type(setting) in [bytes,unicode]: getBuf(outFile).write(as_utf8(setting))
        else: setting(outFile)
    lex = read_user_lexicon(fromFormat)
    emit(checkSetting(toFormat,"lex_header"))
    entryFormat = as_utf8(getSetting(toFormat,"lex_entry_format"))
    wordCase = checkSetting(toFormat,"lex_word_case")
    for word, pronunc in lex:
        converted = as_utf8(convert(pronunc,fromFormat,toFormat))
        if wordCase=="upper": word = word.upper()
        elif wordCase=="lower": word = word.lower()
        # will work in Python 3.6, but not in Python 3.4 (e.g. on jessie) which cannot do % on byte-strings
        getBuf(outFile).write(entryFormat % (as_utf8(word),converted))
    emit(checkSetting(toFormat,"lex_footer"))

3062

\r

3063

def bbcMicro_partPhonemeCount(pronunc):
    """Returns the number of 'part phonemes' (at least that's what I'm calling them) for the BBC Micro phonemes in pronunc.

    The *SPEAK command cannot take more than 117 part-phonemes at a
    time before saying "Line too long", and in some cases it takes less
    than that (I'm not sure why); 115 is a safer limit.

    Fails an assertion if pronunc contains something that is neither a
    known phoneme, a space, nor a pitch digit 1-8.
    """
    # phonemes and space count, but pitch numbers do not count.
    # (Tried in this order, so e.g. DUX is matched before D.)
    phonemes = ' ,AA,AE,AH,AI,AO,AW,AY,B,CH,CT,DH,DUX,D,EE,EH,ER,F,G,/H,IH,IX,IY,J,K,L,M,NX,N,OW,OL,OY,O,P,R,SH,S,TH,T,UH,/UL,/U,UW,UX,V,W,Y,ZH,Z'.split(',')
    # *SPEAK can take 117 of most single-letter phonemes, or 116 (limited by the 232+6-character input limit) of most 2-letter phonemes
    weights = {
        'AW':2,'IY':2,'OW':2,'OL':2,'UW':2,'/UL':2, # *SPEAK can take 58 of these
        'DUX':3,'AY':3,'CH':3,'J':3,'OY':3, # *SPEAK can take 39 of these
        'CT':4, # *SPEAK can take 29 of these
    }
    total = 0 ; whole = pronunc
    while pronunc:
        for p in phonemes:
            if pronunc.startswith(as_utf8(p)):
                total += weights.get(p,1)
                pronunc = pronunc[len(p):]
                break
        else:
            # no phoneme matched here: must be a pitch number, which is skipped uncounted
            assert as_printable(pronunc[:1]) in '12345678',"Unrecognised BBC Micro phoneme at "+str(pronunc)+" in "+str(whole)
            pronunc = pronunc[1:]
    return total

3081

\r

3082

def markup_inline_word(format,pronunc):
    """Returns pronunc with any necessary markup for putting it in a text (using the inline_format setting).

    The inline_format setting of 'format' (default "%s") is either a
    string, which is %-formatted with the UTF-8 pronunciation, or a
    callable, which is called with it.
    """
    # UTF-8 output - ok for pasting into Firefox etc *IF* the terminal/X11 understands utf-8 (otherwise redirect to a file, point the browser at it, and set encoding to utf-8, or try --convert'ing which will o/p HTML)
    encoded = as_utf8(pronunc)
    inline = checkSetting(format,"inline_format","%s")
    if not type(inline) in [bytes,unicode]:
        return inline(encoded)
    if type(inline)==unicode: inline = inline.encode('utf-8') # see above
    return inline % encoded

3090

def markup_doubleTalk_word(pronunc):
    """Special-case function set as inline_format in doubletalk (checks environment variables for command code).

    Returns pronunc wrapped as <cmd>D<pronunc><cmd>T, where <cmd> is the
    single byte whose value is given (as a decimal number) by the
    environment variable DTALK_COMMAND_CODE, or '*' if that variable is
    unset or empty.
    """
    cmd = os.environ.get('DTALK_COMMAND_CODE','')
    if cmd:
        # Build the command character as a byte string on both Python 2 and 3.
        # (chr() gives a text string on Python 3, which cannot be
        # %-formatted into the bytes template below.)
        cmd = bytes(bytearray([int(cmd)]))
    else: cmd = as_utf8('*')
    return as_utf8("%sD%s%sT") % (cmd,pronunc,cmd)

3096

def markup_bbcMicro_word(pronunc):
    """Special-case function set as inline_format in bbcmicro.

    Begins a new *SPEAK command when necessary, i.e. when no command has
    been started yet or when adding this word would exceed the
    part-phoneme or line-length limit.  Progress is tracked in the
    globals bbc_partsSoFar and bbc_charsSoFar.
    See also write_bbcmicro_phones.
    """
    global bbc_partsSoFar,bbc_charsSoFar
    parts = bbcMicro_partPhonemeCount(pronunc)
    # 115: see bbcMicro_partPhonemeCount.
    # 238 is max len of BBC BASIC prompt (both the immediate prompt and the one with line number supplied by AUTO, in both BASIC II and BASIC IV)
    fits_parts = bbc_partsSoFar and bbc_partsSoFar+parts <= 115
    fits_chars = bbc_charsSoFar and bbc_charsSoFar+len(pronunc) <= 238
    if fits_parts and fits_chars:
        # continue the current command (+1 each for the space after this word)
        bbc_charsSoFar += len(pronunc)+1
        bbc_partsSoFar += parts+1
        return pronunc
    # Otherwise start a new command, on a new line unless it's the first.
    if bbc_charsSoFar: prefix = "\n"
    else: prefix = ""
    command = "*SPEAK" # (could add a space if want to make it more readable, at the expense of an extra keystroke in the paste buffer; by the way, when not using the ROM version you must use *SPEAK not OS.("SPEAK"), at least on a Model B; seems OSCLI doesn't go through quite the same vectors as star)
    bbc_charsSoFar = len(command)+len(pronunc)+1 # +1 for the space that'll be after this word if we don't start a new line
    bbc_partsSoFar = parts+1 # ditto
    return as_utf8(prefix+command)+pronunc
bbc_partsSoFar=bbc_charsSoFar=0

3112

\r

3113

def sylcount(example_format_festival):
    """Tries to count the number of syllables in a Festival string (see mainopt_syllables).

    We treat @ as counting the same as the previous syllable (e.g.
    "fire", "power"), but this can vary in different songs, so the
    result will likely need a bit of proofreading.
    """
    phones = example_format_festival.split() # no brackets, emphasis by vowels, but spaces between each syllable
    total = pending = 0
    in_vowel = seen_schwa = False
    for idx,phone in enumerate(phones):
        first = phone[:1]
        if first not in "aeiou@12":
            # Consonant: ends the current vowel group, and confirms a pending
            # schwa syllable unless the consonant is one of d/r/z
            # (e.g. "ours", "fired" usually 1 syllable in songs, "desirable" usually 4 not 5)
            # TODO steward? y u@ 1 d but usually 2 syllables
            if first not in "drz": total += pending
            pending = 0
            in_vowel = seen_schwa = False
            continue
        if first in "aeiou" or not in_vowel:
            total += 1 # a real vowel unconditionally starts a new syllable
        elif first=="@" and not seen_schwa:
            pending = 1 # (e.g. "loyal", but NOT '1', e.g. "world")
        if "@" in phone: seen_schwa = True # for words like "cheerful" ("i@ 1 @" counts as one)
        in_vowel = True
        if first=="@" and idx>=3 and phones[idx-2:idx]==["ai","1"] and phones[idx-3] in ["s","h"]:
            # special rule for higher, Messiah, etc - like "fire" but usually 2 syllables
            pending = 0 ; total += 1
    return total

3131

def hyphenate(word,numSyls):
    """See mainopt_syllables.

    Splits word into numSyls roughly equal pieces joined by "- ",
    nudging each split point so as to separate doubled consonants, keep
    hyphens on the left, and start a piece with a consonant where that
    still leaves a vowel on the left.  Leading and trailing characters
    outside a-z are kept outside the split.  If word can be decoded from
    UTF-8 the result is encoded back to UTF-8.  If numSyls is greater
    than the number of characters available, word is returned unchanged.
    """
    original = word
    try:
        decoded = word.decode('utf-8')
        word,was_utf8 = decoded,True
    except: was_utf8 = False
    # find the span that starts and ends with a letter a-z
    start = 0
    while start < len(word) and not 'a'<=word[start:start+1].lower()<='z': start += 1
    end = len(word)
    while end > start and not 'a'<=word[end-1:end].lower()<='z': end -= 1
    prefix,suffix,word = word[:start],word[end:],word[start:end]
    if numSyls>len(word): return original # probably numbers or something
    size = int((len(word)+numSyls/2)/numSyls)
    syls = []
    for i in range(numSyls):
        if i==numSyls-1: syls.append(word[i*size:])
        else: syls.append(word[i*size:(i+1)*size])
        if len(syls) < 2: continue
        prev,cur = syls[-2],syls[-1]
        if cur.startswith('-') or (len(cur)>2 and cur[:1]==cur[1:2] and not cur[:1].lower() in "aeiou"):
            # repeated consonant at start - put one on previous
            # (or hyphen at start - move it to the previous)
            prev,cur = prev+cur[:1],cur[1:]
        elif len(cur)>2 and cur[1]=='-':
            # better move this splitpoint after that hyphen (TODO: move more than one character?)
            prev,cur = prev+cur[:2],cur[2:]
        elif ((len(prev)>2 and prev[-1]==prev[-2] and not prev[-1].lower() in "aeiou")
              or (cur and cur[:1].lower() in "aeiouy" and len(prev)>2)) \
             and [c for c in prev[:-1] if c.lower() in "aeiou"]:
            # repeated consonant at end - put one on next
            # or vowel on right: move a letter over (sometimes the right thing to do...)
            # (unless doing so leaves no vowels)
            prev,cur = prev[:-1],prev[-1]+cur
        syls[-2:] = [prev,cur]
    word = prefix+"- ".join(syls)+suffix
    if was_utf8: word = word.encode('utf-8')
    return word

3167

\r

3168

def macSayCommand():
    """Return the environment variable SAY_COMMAND if it is set and if it is non-empty, otherwise return "say".

    E.g. SAY_COMMAND="say -o file.aiff" (TODO: document this in the help text?)
    In Gradint you can set (e.g. if you have a ~/.festivalrc) extra_speech=[("en","python lexconvert.py --mac-uk festival")] ; extra_speech_tofile=[("en",'echo %s | SAY_COMMAND="say -o /tmp/said.aiff" python lexconvert.py --mac-uk festival && sox /tmp/said.aiff /tmp/said.wav',"/tmp/said.wav")]
    """
    return os.environ.get("SAY_COMMAND","") or "say"

\r

def stdin_is_terminal():
    """Returns True if it seems the standard input is connected to a terminal (rather than piped from a file etc).

    If sys.stdin has no isatty method, a terminal is assumed.
    """
    if hasattr(sys.stdin,"isatty"):
        return sys.stdin.isatty()
    return True

3179

\r

3180

def getInputText(i,prompt,as_iterable=False):
    """Gets text either from the command line or from standard input.

    The text is taken from sys.argv[i:] (joined by spaces) if that is
    non-empty.  Otherwise it is read from standard input, issuing
    prompt on standard error first if standard input is connected to a
    tty instead of a pipe or file.  If as_iterable, return an iterable
    object over the lines instead of reading and returning all text at
    once.  If as_iterable=='maybe', return the iterable but if not
    reading from a tty then read everything into one item.
    """
    from_args = ' '.join(sys.argv[i:])
    if from_args:
        if as_iterable=='maybe': return [from_args]
        if as_iterable: return from_args.split('\n')
        return from_args
    if stdin_is_terminal():
        sys.stderr.write("Enter "+prompt+" (EOF when done)\n")
    elif as_iterable=='maybe':
        return [getBuf(sys.stdin).read()]
    if as_iterable:
        return my_xreadlines()
    try:
        return getBuf(sys.stdin).read()
    except KeyboardInterrupt:
        raise SystemExit

3193

\r

3194

try:
    raw_input # Python 2
except NameError:
    raw_input = input # Python 3
def my_xreadlines():
    """On some platforms this might be a bit more responsive than sys.stdin.xreadlines.

    Yields lines read with raw_input (so without their line endings)
    until end of input.  A keyboard interrupt while reading raises
    SystemExit.
    """
    while True:
        try: line = raw_input()
        except EOFError: break
        except KeyboardInterrupt: raise SystemExit
        yield line

3202

\r

3203

def output_clauses(format,clauses):
    """Writes out clauses and words in format 'format' (clauses is a list of lists of words in the phones of 'format').

    By default, calls markup_inline_word and join as appropriate.  If
    however the format's 'clause_separator' has been set to a special
    case (i.e. something that is not a string), calls that with clauses
    instead.  Refuses to write a format marked output_is_binary to a
    terminal.
    """
    to_terminal = hasattr(sys.stdout,"isatty") and sys.stdout.isatty()
    if checkSetting(format,"output_is_binary") and to_terminal:
        print ("This is a binary format - not writing to terminal.\nPlease direct output to a file or pipe.")
        return
    clause_sep = checkSetting(format,"clause_separator","\n")
    if not type(clause_sep) in [bytes,unicode]:
        clause_sep(clauses)
        return
    out = getBuf(sys.stdout)
    separator = as_utf8(clause_sep)
    rendered = []
    for clause in clauses:
        # words are marked up strictly in order (some inline formats keep state between calls)
        word_sep = as_utf8(wordSeparator(format))
        words = []
        for word in clause: words.append(markup_inline_word(format,word))
        rendered.append(word_sep.join(words))
    out.write(separator.join(rendered))

3211

def write_bbcmicro_phones(clauses):
    """Special-case function set as clause_separator in bbcmicro format.

    Must be a special case because it needs to track any extra
    keystrokes to avoid "Line too long".  And while we're at it, we
    might as well start a new *SPEAK command with each clause, using the
    natural brief delay between commands; this should minimise the
    occurrence of additional delays in arbitrary places.  Also calls
    print_bbc_warnings.
    """
    totalKeystrokes = 0 ; lines = 0
    for clause in clauses:
        # Resetting the character count makes markup_bbcMicro_word begin this
        # clause with a fresh *SPEAK command (and no leading newline).
        global bbc_charsSoFar ; bbc_charsSoFar=0
        l=as_utf8(" ").join([markup_inline_word("bbcmicro",word) for word in clause])
        # Each clause must end its line, otherwise the next clause's *SPEAK
        # would be glued onto the end of this one; this newline is the "+1"
        # keystroke counted below.
        getBuf(sys.stdout).write(l.replace(as_utf8(" \n"),as_utf8("\n"))+as_utf8("\n"))
        totalKeystrokes += len(l)+1 ; lines += 1
    print_bbc_warnings(totalKeystrokes,lines)

3220

def print_bbc_warnings(keyCount,lineCount):
    """Print any relevant size warnings regarding sending 'keyCount' keys in 'lineCount' lines to the BBC Micro.

    Warnings go to standard error; nothing is written if no limit is
    exceeded.  The message is prefixed "WARNING: " if the BeebEm
    keystroke limit is exceeded and "Note: " otherwise.
    """
    sys.stdout.flush() # try to keep in sync if someone's doing 2>&1 | less
    exceeded = []
    # BeebEm limit: At least in version 3, the clipboard is defined in beebwin.h as a char of size 32768 and its bounds are not checked. Additionally, if you script a second paste before the first has finished (or if you try to use BeebEm's Copy command) then the first paste will be interrupted. So if you really want to make BeebEm read more then I suggest setting a printer destination file, putting a VDU 2,10,3 after each batch of commands, and waiting for that \n to appear in that printer file before sending the next batch, or perhaps write a set of programs to a disk image and have them CHAIN each other or whatever.
    is_severe = keyCount >= 32768
    if is_severe: exceeded.append("BeebEm 32K keystroke limit")
    shadow_himem = 0x8000 # if using a 'shadow mode' on the Master/B+/Integra-B (modes 128-135, which leave all main RAM free)
    mode7_himem = 0x7c00 # (40x25 characters = 1000 bytes, by default starting at 7c00 with 24 bytes spare at the top, but the scrolling system uses the full 1024 bytes and can tell the video controller to start rendering at any one of them; if you get Jeremy Ruston's book and program the VIDC yourself then you could fix it at 7c18 if you really want, or just set HIMEM=&8000 and don't touch the screen, but that doesn't give you very much more room)
    speech_loc = 0x5500
    line_overhead = 4
    # Advice on the Master: I don't know why but SP8000 can play higher and more distorted than SPEECH, at least on emulation (and changing the emulation speed doesn't help, because that setting, at least in BeebEm3, just controls extra usleep every frame; it doesn't actually slow down the 6502 *between* frames; anyway timing of sound changes is done by CyclesToSamples stuff in beebsound.cc's SoundTrigger). If on the Master you go into View (*WORD) and then try SP8000, it plays _lower_ than *SPEECH (even if you do *BASIC first) and *SAY can corrupt a View document; ViewSheet (*SHEET) doesn't seem to have this effect; neither does *TERMINAL but *SAY can confuse the terminal.
    # Re bank numbers, by default banks 4 to 7 are Sideways RAM (4*16k=64k) and I suppose filling up from 7 makes sense because banks 8-F are ROMs (ANFS,DFS,ViewSheet,Edit,BASIC,ADFS,View,Terminal; OS is a separate 16k so there's scope for 144k of supplied ROM). Banks 0-3 are ROM expansion slots. The "128" in the name "Master 128" comes from 32k main RAM, 64k Sideways RAM, 20k shadow RAM (for screen modes 128-135), 4k OS "private RAM" (paged on top of 8000-8FFF) and 8k filing system RAM (paged on top of C000-DFFF) = 128k. Not sure what happened on the B+.
    # By the way BeebEm's beebsound.cc also shows us why SOUND was always out of tune especially in the higher pitches. The 16-bit freqval given to the chip is 125000/freq and must be an integer, so the likely temperament in cents for non-PCM is given by [int(math.log(125000.0/math.ceil(125000/freq)/freq,2**(1.0/1200))) for freq in [440*((2**(1.0/12))**semi) for semi in range(-12*3+2,12*2+6)]] (the actual temperament will depend on the OS's implementation of mapping SOUND pitch values to freqval's, unless you program the chip directly, but this list is indicative and varies over 10% in the top 2 octaves)
    # Some other ROMs (e.g. Alan Blundell's "Informant" 1989) seem to result in a crash after the *SPEECH and/or *SPEAK commands complete, at least in some emulator configurations; this may or may not be resolved via timing adjustments or adjustments in the ROM order; not sure exactly what the problem is
    master_advice = " (use Speech's Sideways RAM version instead, e.g. *SRLOAD SP8000 8000 7 and reset, but sound quality might be worse)"
    modelB_advice = " (Speech program will be overwritten unless relocated)" # (could use Sideways RAM for it instead if you have it fitted, see above)
    machines = (
        (0x1900,"Model B"), # with Acorn DFS (a reasonable assumption although alternate DFS ROMs are different)
        (0xE00,"Master"), # (the Master has 8k of special paged-in "filing system RAM", so doesn't need 2816 bytes of main RAM for DFS)
    )
    for page,model in machines:
        is_master = (model=="Master")
        top = page+keyCount+lineCount*(line_overhead-1)+2 # the -1 is because keyCount includes a carriage return at the end of each line
        if top > speech_loc:
            # The Speech program does nothing to stop your program (or its variables etc) from growing large enough to overwrite &5500, nor does it stop the stack pointer (coming down from HIMEM) from overwriting &72FF. For more safety on a Model B you could use RELOCAT to put Speech at &5E00 and be sure to set HIMEM=&5E00 before loading, but then you must avoid commands that change HIMEM, such as MODE (but selecting any non-shadow mode other than 7 will overwrite Speech anyway, although if you set the mode before loading Speech then it'll overwrite screen memory and still work as long as the affected part of the screen is undisturbed). You can't do tricks like ditching the lexicon because RELOCAT won't let you go above 5E00 (unless you fix it, but I haven't looked in detail; if you can fix RELOCAT to go above 5E00 then you can create a lexicon-free Speech by taking the 1st 0x1560 bytes of SPEECH and append two * bytes, relocate to &6600 and set HIMEM, but don't expect *SAY to work, unless you put a really small lexicon into the spare 144 bytes that are left - RELOCAT needs an xx00 address so you can't have those bytes at the bottom). You could even relocate to &6A00 and overwrite (non-shadow) screen memory if you don't mind the screen being filled with gibberish that you'd better not erase! (well if you program the VIDC as mentioned above and you didn't re-add a small lexicon then you could get yourself 3.6 lines of usable Mode 7 display from the spare bytes but it's probably not worth the effort)
            if is_master: advice = master_advice
            else: advice = modelB_advice
            exceeded.append("%s TOP=&%X limit%s" % (model,speech_loc,advice))
        if top <= mode7_himem: continue
        if not is_master:
            # unless you overwrite the screen (see above) - let's assume the Model B hasn't been fitted with shadow modes (although the Integra-B add-on does give them to the Model B, and leaves PAGE at &1900; B+ has shadow modes but I don't know what's supposed to happen to PAGE on it). 65C02 Tube doesn't help much (it'll try to run Speech on the coprocessor instead of the host, and this results in silence because it can't send its sound back across the Tube; don't know if there's a way to make it run on the host in these circumstances or what the host's memory map is like)
            exceeded.append(model+" Mode 7 HIMEM limit")
        elif top > shadow_himem:
            # TODO: maybe add instructions for using BAS128 on the B+ or Master; this sets PAGE=&10000 and HIMEM=&20000 (i.e. 64k for programs), which uses all 4 SRAM slots so you can't use SP8000 (unless it's on a real ROM); if using Speech in main memory you need to RELOCAT it to leave &3000 upwards for Bas128 code; putting it at &1900 for B+/DFS leaves you only 417 bytes for lexicon (which might not matter if you're using only *SPEECH: just create a shortened lexicon); putting it at &E00 for Master allows space for the default 2204-byte lexicon with 1029 bytes to spare; TODO check if Bas128 uses any workspace between &E00 and &3000 though. Alternatively (if you really want to store such a long program on the BBC) then you'd better split it into several programs that CHAIN each other (as mentioned above).
            exceeded.append(model+" 32k HIMEM limit (even for shadow modes)")
        else:
            exceeded.append(model+" Mode 7 HIMEM limit (use shadow modes 128-135)")
    if lineCount > 32768:
        exceeded.append("BBC BASIC line number limit") # and you wouldn't get this far without filling the memory, even with 128k (4 bytes per line)
    elif 10*lineCount > 32767:
        exceeded.append("AUTO line number limit (try AUTO 0,1)") # (default AUTO increments in steps of 10; you can use AUTO 0,1 to start at 0 and increment in steps of 1. BBC BASIC stores its line info in a compact form which allows a range of 0-32767.)
    if not exceeded: return
    if is_severe: prefix,reassurance = "WARNING: ",""
    else: prefix,reassurance = "Note: ","It should still work if pasted into BeebEm as immediate commands. "
    tail = ". "+reassurance+"See comments in lexconvert for more details.\n"
    if len(exceeded)>1:
        sys.stderr.write(prefix+"this text may be too big for the BBC Micro. The following limits were exceeded: "+", ".join(exceeded)+tail)
    else:
        sys.stderr.write(prefix+"this text may be too big for the BBC Micro because it exceeds the "+exceeded[0]+tail)

3252

def bbc_prepDefaultLex(outFile):
    """Special-case function set as lex_header in bbcmicro format. If SPEECH_DISK and MAKE_SPEECH_ROM is set, then read the ROM code from SPEECH_DISK and write to outFile (meant to go before the lexicon, to make a modified BBC Micro Speech ROM with custom lexicon).

    Does nothing unless the MAKE_SPEECH_ROM environment variable is set.
    Raises KeyError if SPEECH_DISK is not set, ValueError if the expected
    markers are not found in the disk image, and AssertionError if the
    code between the markers is not the expected &1683 bytes."""
    if not os.environ.get("MAKE_SPEECH_ROM",0): return
    sd = open(os.environ['SPEECH_DISK']) # if this fails, SPEECH_DISK was not set or was set incorrectly (it's required for MAKE_SPEECH_ROM)
    try: d = getBuf(sd).read()
    finally: sd.close() # don't leave the disk image open (try/finally rather than 'with', for old Python 2 versions)
    i = d.index(as_utf8('LO')+chr(0x80)+as_utf8('LP')+chr(0x80)+chr(0x82)+chr(0x11)) # start of SP8000 file (if this fails, it wasn't a Speech disk)
    j = d.index(as_utf8('>OUS_'),i) # start of lexicon (ditto)
    assert j-i==0x1683, "Is this really an original disk image?"
    getBuf(outFile).write(d[i:j])

3261

def bbc_appendDefaultLex(outFile):
    """Special-case function set as lex_footer in bbcmicro format. If SPEECH_DISK is set, read Speech's default lexicon from it and append this to outFile. Otherwise just write a terminating >** to outFile. In either case, check for exceeding 16k if we're MAKE_SPEECH_ROM, close the file and call print_bbclex_instructions.

    Raises ValueError if the lexicon markers are not found on SPEECH_DISK,
    and AssertionError if the default lexicon is not 2201 bytes or the
    ROM image would exceed 16384 bytes."""
    if os.environ.get("SPEECH_DISK",""):
        sd = open(os.environ['SPEECH_DISK'])
        try: d = getBuf(sd).read()
        finally: sd.close() # don't leave the disk image open (try/finally rather than 'with', for old Python 2 versions)
        i = d.index(as_utf8('>OUS_')) # if this fails, it wasn't a Speech disk
        j = d.index(as_utf8(">**"),i)
        assert j-i==2201, "Lexicon on SPEECH_DISK is wrong size (%d). Is this really an original disk image?" % (j-i)
        getBuf(outFile).write(d[i:j])
        # TODO: can we compress the BBC lexicon? i.e. detect if a rule will happen anyway due to subsequent wildcard rules, and delete it if so (don't know how many bytes that would save)
    outFile.write(">**") # lexicon terminator (the default lexicon's own terminator was not copied above)
    fileLen = outFile.tell()
    assert not os.environ.get("MAKE_SPEECH_ROM",0) or fileLen <= 16384, "Speech ROM file got too big (%d)" % fileLen
    outFile.close()
    print_bbclex_instructions(getSetting("bbcmicro","lex_filename"),fileLen)

3276

\r

3277

def bbcshortest(n):
    """Return integer n written in as few BBC Micro keystrokes as possible.
    Hex (with its '&' prefix) is chosen whenever it is no longer than
    the decimal form; otherwise decimal is used.  The result is passed
    through as_utf8."""
    decimalForm = str(n)
    hexForm = '&%X' % n
    if len(hexForm) <= len(decimalForm): return as_utf8(hexForm)
    return as_utf8(decimalForm)

3281

def bbcKeystrokes(data,start):
    "Return BBC BASIC keystrokes to put data into RAM starting at address start, without using the BASIC heap in the process (although we do use one of the page-4 integer variables to save some keystrokes). Assumes the data is mostly ASCII so the $ operator is the least-keystrokes method of getting it in (rather than ? and ! operators, assembler EQUB/EQUW/EQUS, 6502 mnemonics, etc); we don't mind about overwriting the byte after with a CHR$(13). Keystrokes are limited to ASCII for easier copy/paste. See comments for more details."
    # Design notes:
    # - A true 'least keystrokes' function would be a data compressor. We
    #   don't attempt that: the expected input is a lexicon (ASCII with a
    #   few CHR$(128)s), for which the '$ operator' is very likely the
    #   shortest, short of temporary string variables (which would need
    #   heap memory, and that might not be safe depending on where the
    #   lexicon is being put).
    # - For the same reason we don't set A$=CHR$(128) (not sure where the
    #   workspace for assembling strings is either).
    # - What we CAN do when there are many CHR$(128)s: set up BASIC's
    #   page-4 integer variables so that $A%=CHR$(128), saving 6
    #   keystrokes per entry without the heap (A$ on the heap would save
    #   one more per entry).
    use_int_hack = ((start>=1030 or start+len(data)<=1026) and len(data.split(chr(128))) >= 4)
    if use_int_hack:
        thisLine = as_utf8("A%=&408:B%=&D80:") # (@% is at &400 and each is 4 byte LSB-MSB; $x reads to next 0D)
        # (If Bas128 were guaranteed NOT in use, so addresses are masked
        # by &FFFF, A%=&D800406 would save 5 keystrokes and one variable,
        # but that's not worth the loss of readability or the Bas128 risk -
        # don't know how Bas128 treats addresses above &1FFFF.)
        # (!13=&D80 with $13 would reuse the random number generator's
        # bytes, but BASIC gives a "$ range" error on anything below 256.)
        # (LOMEM=&400:A$=CHR$(13) ... LOMEM=TOP would allow plain A$, at
        # the expense of 3 page-4 variables and losing track of any heap
        # variables.  But this is getting silly.)
    else:
        thisLine = as_utf8("")
    bbc_max_line_len = 238
    finishedLines = []
    inQuote = needPlus = 0
    needCmd = 1
    i = 0
    while i < len(data):
        if needCmd: # start a fresh "$addr=" assignment on this line
            thisLine += (as_utf8('$')+bbcshortest(start)+as_utf8('='))
            inQuote = needPlus = needCmd = 0
        ch = data[i:i+1]
        # piece = the keystrokes for this byte; inQ = 1 if it must sit inside a string literal.
        # inQ MUST be 0 or 1, not False/True, because it's also used as 'len of necessary close quote' below
        if ch == as_utf8('"'): piece,inQ = as_utf8('""'),1
        elif 32 <= ord(ch) < 127: piece,inQ = ch,1
        elif use_int_hack and ord(ch)==128: piece,inQ = as_utf8("$A%"),0
        else: piece,inQ = as_utf8("CHR$("+str(ord(ch))+")"),0
        # Work out what must come before the piece (quote open/close, '+')
        prefixParts = []
        if inQ:
            if inQuote:
                newNeedPlus = needPlus # already inside the literal: nothing to add
            else:
                if needPlus: prefixParts.append(as_utf8('+'))
                prefixParts.append(as_utf8('"'))
                newNeedPlus = 0
        else:
            if inQuote: prefixParts.append(as_utf8('"+'))
            elif needPlus: prefixParts.append(as_utf8('+'))
            newNeedPlus = 1 # after what we'll add
        prefixParts.append(piece)
        addToLine = as_utf8('').join(prefixParts)
        if len(thisLine)+len(addToLine)+inQ > bbc_max_line_len:
            # oops, we've gone too far, back off and end prev line
            # (this byte is retried on the next line, so i is not advanced)
            if inQuote: thisLine += as_utf8('"')
            finishedLines.append(thisLine)
            thisLine = as_utf8("")
            needCmd = 1
            continue
        thisLine += addToLine
        inQuote = inQ
        needPlus = newNeedPlus
        i += 1
        start += 1
    if inQuote: thisLine += as_utf8('"')
    if not needCmd: finishedLines.append(thisLine)
    return as_utf8('\n').join(finishedLines)+as_utf8('\n')

3325

def print_bbclex_instructions(fname,size):
    """Print suitable instructions for a BBC Micro lexicon of the given filename and size (the exact nature of the instructions depends on the size). If appropriate, create a .key file containing keystrokes for transferring to an emulator.

    fname is the lexicon file just written; size is its length in bytes.
    If MAKE_SPEECH_ROM is set in the environment, only the ROM
    installation note (plus general advice) is printed."""
    if os.environ.get("MAKE_SPEECH_ROM",0): print ("%s (%d bytes, hex %X) can now be installed on an emulator (set in Roms.cfg or whatever), or loaded onto a chip. The sound quality of this might be worse than that of the main-RAM version." % (fname,size,size)) # (at least on emulation - see comment on sound quality above)
    else:
        print ("The size of this lexicon is %d bytes (hex %X)" % (size,size)) # (the default lexicon is 2204 bytes)
        bbcStart=None
        noSRAM_lex_offset=0x155F # (on the BBC Micro, SRAM means Sideways RAM, not Static RAM as it does elsewhere; for clarity we'd better say "Sideways RAM" in all output)
        SRAM_lex_offset=0x1683
        SRAM_max=0x4000 # 16k
        noSRAM_default_addr=0x5500
        noSRAM_min_addr=0xE00 # minimum supported by RELOCAT
        page=0x1900 # or 0xE00 for Master (but OK to just leave this at 0x1900 regardless of model; it harmlessly increases the range where special_relocate_instructions 'kick in')
        noSRAM_himem=0x7c00 # unless you're in a shadow mode or something (see comments on himem above), however leaving this at 0x7c00 is usually harmless (just causes the 'need to relocate' to 'kick in' earlier, although if memory is really full it might say 'too big' 1k too early)
        def special_relocate_instructions(reloc_addr):
            # Returns extra instruction text (possibly ""), or False if
            # RELOCAT cannot manage this address even with PAGE changes.
            pagemove_min,pagemove_max = max(0xE00,page-0x1E00), page+0xE00 # if relocating to within this range, must move PAGE before loading RELOCAT. RELOCAT's supported range is 0xE00 to 0x5E00, omitting (PAGE-&1E00) to (PAGE+&E00)
            if reloc_addr < 0x1900: extra=" On a Model B with Acorn DFS you won't be able to use the disk after relocating below &1900, and you can't run star commands from tape so you have to initialise via CALL. (On a Master, DFS is not affected as it doesn't use &E00-&1900.)"
            else: extra = ""
            if not pagemove_min<=reloc_addr<pagemove_max:
                return extra # no other special instructions needed
            newpage = reloc_addr+0x1E00
            page_max = min(0x5E00,noSRAM_default_addr-0xE00)
            if newpage > page_max: return False # "Unfortunately RELOCAT can't put it at &%X even with PAGE changes." % reloc_addr
            return " Please run RELOCAT with PAGE in the range of &%X to &%X for this relocation to work.%s" % (newpage,page_max,extra)
        if noSRAM_default_addr+noSRAM_lex_offset+size > noSRAM_himem:
            # Doesn't fit at the default address: try the highest page-aligned address that does
            reloc_addr = noSRAM_himem-noSRAM_lex_offset-size
            reloc_addr -= (reloc_addr%256)
            if reloc_addr >= noSRAM_min_addr:
                instr = special_relocate_instructions(reloc_addr)
                if instr is False: print ("This lexicon is too big for Speech in main RAM even with relocation, unless RELOCAT is rewritten to work from files.") # 'is False' so that an empty-string result is clearly not confused with failure
                else:
                    bbcStart = reloc_addr+noSRAM_lex_offset
                    reloc_call = reloc_addr + 0xB00
                    print ("This lexicon is too big for Speech at its default address of &%X, but you could use RELOCAT to put a version at &%X and then initialise it with CALL %s (or do the suggested *SAVE, reset, and run *SP). Be sure to set HIMEM=&%X. Then *LOAD %s %X or change the relocated SP file from offset &%X.%s" % (noSRAM_default_addr,reloc_addr,bbcshortest(reloc_call),reloc_addr,fname,bbcStart,noSRAM_lex_offset,instr))
            else: print ("This lexicon is too big for Speech in main RAM even with relocation.")
        else: # fits at default location - no relocation needed
            bbcStart = noSRAM_default_addr+noSRAM_lex_offset
            print ("You can load this lexicon by *LOAD %s %X or change the SPEECH file from offset &%X. Suggest you also set HIMEM=&%X for safety." % (fname,bbcStart,noSRAM_lex_offset,noSRAM_default_addr))
        if bbcStart: # we managed to fit it into main RAM
            f = open(fname)
            try: keys = bbcKeystrokes(getBuf(f).read(),bbcStart)
            finally: f.close()
            f = open(fname+".key","w")
            try: getBuf(f).write(keys)
            finally: f.close() # explicit close so the .key file is flushed before we announce it
            print ("For ease of transfer to emulators etc, a self-contained keystroke file for putting %s data at &%X has been written to %s.key" % (fname,bbcStart,fname))
            if len(keys) > 32767: print ("(This file looks too big for BeebEm to paste though)") # see comments elsewhere
        # Instructions for replacing lex in SRAM:
        if size > SRAM_max-SRAM_lex_offset: print ("This lexicon is too big for Speech in Sideways RAM.") # unless you can patch Speech to run in SRAM but read its lexicon from main RAM, or run in main RAM but page in multiple banks of SRAM for the lexicon (but even then there'll be a limit)
        else: print ("You can load this lexicon into Sideways RAM by *SRLOAD %s %X 7 (or whichever bank number you're using), or change the SP8000 file from offset &%X." % (fname,SRAM_lex_offset+0x8000,SRAM_lex_offset))
        if not os.environ.get("SPEECH_DISK",""): print ("If you want to append the default lexicon to this one, set SPEECH_DISK to the image of the original Speech disk before running lexconvert, e.g. export SPEECH_DISK=/usr/local/BeebEm3/diskimg/Speech.ssd")
        if size <= SRAM_max-SRAM_lex_offset: print ("You can also set MAKE_SPEECH_ROM=1 (along with SPEECH_DISK) to create a SPEECH.ROM file instead")
    # NOTE(review): the nesting of the two lines below was reconstructed (general advice, assumed to apply in both branches) - confirm against the repository
    print ("If you get 'Mistake in speech' when testing some words, try starting with '*SAY, ' (this seems to be a Speech bug)") # - can't track down which words it does and doesn't apply to
    print ("It might be better to load your lexicon into eSpeak and use lexconvert's --phones option to drive the BBC with phonemes.")

3377

\r

3378

def mainopt_version(i):
    # (No docstring: this prints just lexconvert's name and version
    # number, taken from the start of the module docstring, and nothing
    # else.  TODO: doc string for the help? or would this option clutter
    # it needlessly)
    titleLine = __doc__.split("\n")[0]
    nameAndVersion = titleLine.split(" - ")[0]
    print (nameAndVersion)

3381

\r

3382

def main():
    """Introspect the module to find the mainopt_ functions, and either call one of them or print the help. Returns the error code to send back to the OS."""
    def funcToOpt(n):
        # mainopt_some_name -> --some-name
        return "--"+n[n.index("_")+1:].replace("_","-")
    for k,v in globals().items():
        if k.startswith('mainopt_') and funcToOpt(k) in sys.argv:
            try: msg = v(sys.argv.index(funcToOpt(k)))
            except Message:
                # Python 2.6+ can have "except Message as e",
                # but Python 2.5 has to have "except Message,e"
                # which is disallowed in Python 3, so
                msg=sys.exc_info()[1].message
            if msg: # the option function returned (or raised) an error message
                sys.stdout.flush()
                sys.stderr.write(msg+"\n") ; return 1
            else: return 0
    # No option function was requested: print the help text
    html = ('--htmlhelp' in sys.argv) # (undocumented option used for my website, don't rely on it staying)
    if html:
        def htmlify(h):
            # Escape HTML metacharacters ('&' first, so the entities we add are not re-escaped), then mark up option names and line breaks
            h = h.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;')
            return re.sub('(--[2A-Za-z-]*)',r'<kbd>\1</kbd>',h.replace('\n','<br>'))
    else: htmlify = lambda x:x
    print (htmlify(__doc__))
    if html: missALine = "<p>"
    else: missALine = ""
    print (missALine)
    if '--formats' in sys.argv: # non-HTML mode only (format descriptions are included in HTML anyway, and don't worry about the capability summary)
        print ("Available pronunciation formats (and support levels):")
        keys=list(lexFormats.keys()) ; keys.sort()
        for k in keys:
            types = []
            if not k=="example": types.append("phones")
            if k=="mac-uk": types.append("speaking")
            else:
                if checkSetting(k,"lex_read_function"): types.append("lex-read")
                if checkSetting(k,"lex_filename") and checkSetting(k,"lex_entry_format"):
                    ltype = checkSetting(k,"lex_type")
                    if ltype: ltype=" as "+ltype
                    types.append("lex-write"+ltype)
            print ("\n"+k+" ("+", ".join(types)+")")
            print (getSetting(k,"doc"))
        return 0
    elif html:
        print ("Available pronunciation formats:")
        if html: print ('<table id="formats">')
        keys=list(lexFormats.keys()) ; keys.sort()
        for k in keys: print ('<tr><td valign="top"><nobr>'+k+'</nobr></td><td valign="top">'+htmlify(getSetting(k,"doc"))+"</td></tr>")
        print ("</table><script></script>")
    else: print ("Available pronunciation formats: "+", ".join(sorted(list(lexFormats.keys())))+"\n(Use --formats to see their descriptions)")
    print (missALine)
    print ("Program options:")
    print (missALine)
    if html: print ("<dl>")
    # Sort key: options whose docstring starts with '*' sort first (False < True), then by name
    for _,opt,desc in sorted([(not not v.__doc__ and not v.__doc__.startswith('*'),k,v.__doc__) for k,v in globals().items()]):
        if not opt.startswith("mainopt_"): continue
        opt = funcToOpt(opt)
        if not desc: continue # undocumented option
        params,rest = desc.split("\n",1) # first docstring line is the option's parameters
        if params.startswith('*'): params=params[1:]
        if params: opt += (' '+params)
        if html: print ("<dt>"+htmlify(opt)+"</dt><dd>"+htmlify(rest)+"</dd>")
        else: print (opt+"\n"+rest+"\n")
    if html: print ("</dl>")
    return 0

3442

\r

3443

catchingSigs = inSigHandler = False
def catchSignals():
    "We had better try to catch all signals if using MacBritish_System_Lexicon so we can safely clean it up. We raise KeyboardInterrupt instead (need to catch this). Might not work with multithreaded code."
    global catchingSigs
    if catchingSigs: return # handlers already installed
    catchingSigs = True
    import signal
    def f(sigNo,*args):
        global inSigHandler
        if inSigHandler: return # ignore signals that arrive while one is already being handled (including the one we re-send below)
        inSigHandler = True
        os.killpg(os.getpgrp(),sigNo) # pass the signal on to the whole process group
        sys.stderr.write("\nCaught signal %d\n" % sigNo)
        raise KeyboardInterrupt
    notCaught = [
        signal.SIGCHLD, # sent on subprocess completion
        signal.SIGTSTP,signal.SIGCONT, # Ctrl-Z / fg
        signal.SIGWINCH, # window-size change
        ]
    for n in range(1,signal.NSIG): # (range not xrange: works on both Python 2 and Python 3, and NSIG is small)
        if n in notCaught: continue
        if signal.getsignal(n)==signal.SIG_IGN: continue # leave ignored signals ignored
        try: signal.signal(n,f)
        except (ValueError,OSError,RuntimeError): pass # best effort: some signals (e.g. SIGKILL, SIGSTOP) can't be caught

3465

class MacBritish_System_Lexicon(object):\r

3466

"""Overwrites some of the pronunciations in the system\r

3467

lexicon (after backing up the original). Cannot\r

3468

change the actual words in the system lexicon, so just\r

3469

alters pronunciations of words you don't intend to use\r

3470

so you can substitute these into your texts.\r

3471

Restores the lexicon on close()."""\r

3472

instances = {}\r

3473

def __init__(self,text="",voice="Daniel"):
    """text is the text you want to speak (so that any
    words used in it that are not mentioned in your
    lexicon are unchanged in the system lexicon);
    text="" means you just want to speak phonemes.
    Special value of text=False means lexicon read only.
    voice can be Daniel, Emily or Serena."""
    readOnly = (text==False) # (== not 'not': text="" is NOT read-only)
    self.voice = False
    if not readOnly:
        assert not voice in MacBritish_System_Lexicon.instances, "There is already another instance of MacBritish_System_Lexicon for the "+voice+" voice"
        assert not os.system("lockfile -1 -r 10 /tmp/"+voice+".PCMWave.lock") # in case some other process has it (note: if you run with python -O, this check won't happen!)
        self.voice = voice # (don't set this if text==False, since we won't need cleanup on __del__)
    self.filename = "/System/Library/Speech/Voices/"+voice+".SpeechVoice/Contents/Resources/PCMWave"
    assert not (not os.path.exists(self.filename) and os.path.exists("/System/Library/Speech/Voices/"+voice+"Compact.SpeechVoice/Contents/Resources/PCMWave")), "The only installation of "+voice+" found on this system was the Compact one, which lexconvert does not yet support" # TODO: could try self.wordIndexStart = findW("Abiquiu"),self.phIndexStart = findW("'@b.Ik.ju"),self.wordIndexEnd = findW("www.youtube.com",1),self.phIndexEnd = findW("'d^b.l.ju.'d^b.l.ju.'d^b.l.ju.dA+t.'ju.'tjub.dA+t.kA+m",1), but "t" in phones should be ignored, "activesync" and "afterlife" have no phones, "aqua" has TWO sets of phonemes (aquarium ok) and there are other synchronization issues.
    # TODO: some sync issues persist even on the NON-Compact version in newer versions of macOS (e.g. 10.12). This currently leads to exceptions in findW on such systems (which do say it could be due to wrong version of the voice); fixing would need looking at more sync issues as above
    assert os.path.exists(self.filename),"Cannot find an installation of '"+voice+"' on this system"
    if os.path.exists(self.filename+"0"):
        if readOnly: self.filename += "0" # (use the backup file for read-only, if we created one before; this means we don't have to worry about locks)
    elif not readOnly: # create a backup
        sys.stderr.write("Backing up "+self.filename+" to "+self.filename+"0...\n") # (you'll need a password if you're not running as root)
        err = os.system("sudo mv \""+self.filename+"\" \""+self.filename+"0\"; sudo cp \""+self.filename+"0\" \""+self.filename+"\"; sudo chown "+str(os.getuid())+" \""+self.filename+"\"")
        assert not err, "Error creating backup"
    lexFile = self.filename+".lexdir" # cache of the four index positions found below
    if not os.path.exists(lexFile) and not readOnly:
        sys.stderr.write("Creating lexdir file...\n")
        err = os.system("sudo touch \""+lexFile+"\" ; sudo chown "+str(os.getuid())+" \""+lexFile+"\"")
        assert not err, "Error creating lexdir"
    compat_err = "\nThis probably means your Mac has a new version of the voice that is no longer compatible with this system-lexicon patch."
    import cPickle
    if os.path.exists(lexFile) and os.stat(lexFile).st_size:
        # positions were cached by an earlier run
        self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd = cPickle.Unpickler(open(lexFile)).load()
    else:
        # locate the word and phoneme tables by searching for known first and last entries
        voiceFile = open(self.filename)
        dat = getBuf(voiceFile).read()
        def findW(word,rtnPastEnd=0):
            # byte position of the unique NUL-terminated occurrence of word in dat (position just past it if rtnPastEnd)
            matches = re.finditer(re.escape(word+chr(0)),dat)
            try: first = matches.next()
            except StopIteration: raise Exception(word+" not found in voice file"+compat_err)
            try: second = matches.next()
            except StopIteration: second = None
            if second is not None:
                raise Exception("%s does not uniquely identify a byte position (has at least %d and %d)%s" % (word,first.start(),second.start(),compat_err))
            if rtnPastEnd: return first.end()
            return first.start()
        self.wordIndexStart = findW("808s")
        self.phIndexStart = findW("'e&It.o&U.e&Its")
        self.wordIndexEnd = findW("zombie",1)
        self.phIndexEnd = findW("'zA+m.bI",1)
        if not readOnly: cPickle.Pickler(open(lexFile,"w")).dump((self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd))
    if readOnly: self.dFile = open(self.filename)
    else: self.dFile = open(self.filename,'r+')
    assert len(self.allWords()) == len(self.allPh()), str(len(self.allWords()))+" words but "+str(len(self.allPh()))+" phonemes"+compat_err
    self.textToAvoid = u""
    if readOnly: return
    MacBritish_System_Lexicon.instances[voice] = self
    self.textToAvoid = text.decode('utf-8').replace(unichr(160),' ')
    self.restoreDic = {}
    catchSignals()

3529

def allWords(self):
    "Returns a list of words that are defined in the system lexicon (which won't be changed, but see allPh)"
    self.dFile.seek(self.wordIndexStart)
    tableLen = self.wordIndexEnd-self.wordIndexStart
    rawTable = getBuf(self.dFile).read(tableLen)
    # entries are separated by chr(0); discard the empty ones
    return [w for w in rawTable.split(chr(0)) if w]

3533

def allPh(self):
    "Returns a list of (file position, phoneme string) for each of the primary phoneme entries from the system lexicon. These entries can be changed in-place by writing to the said file position, and then spoken by giving the voice the corresponding word from allWords (but see also usable_words)."
    self.dFile.seek(self.phIndexStart)
    entries = getBuf(self.dFile).read(self.phIndexEnd-self.phIndexStart).split(chr(0))
    # (the listed pronunciations are secondary ones that for some reason are in the list)
    alwaysSecondary = ["'a&I.'fo&Un","'lI.@n","'so&Un.j$"]
    secondaryIfRepeated = ["'tR+e&I.si"] # dropped only when immediately repeating the previous printable entry
    found = [] ; previous = None ; pos = self.phIndexStart
    for entry in entries:
        if re.search(r'[ -~]',entry): # entry contains at least one printable ASCII character
            if not entry in alwaysSecondary and not (entry==previous and entry in secondaryIfRepeated): found.append((pos,entry))
            previous = entry
        pos += (len(entry)+1) # +1 for the \x00
    assert pos==self.phIndexEnd+1 # +1 because the last \00 will result in a "" item after; the above +1 will be incorrect for that item
    return found

3545

def usable_words(self,words_ok_to_redefine=[]):
    "Generates (word,phoneme_file_position,original_phonemes) by pairing allWords with allPh, leaving out any word that doesn't look 'usable': words not made up purely of lower-case letters and digits (e.g. containing spaces - such lexicon entries don't seem to be used by the voice), entries whose phonemes don't start with a printable non-space ASCII character, and words occurring in self.textToAvoid unless they are also in words_ok_to_redefine (user lexicon)."
    for word,entry in zip(self.allWords(),self.allPh()):
        pos,phonemes = entry
        if re.match("^[a-z0-9]*$",word) is None: continue # it seems words not matching this regexp are NOT used by the engine
        if not phonemes: continue
        if not 32 < ord(phonemes[:1]) < 127: continue # better not touch those, just in case
        if word in self.textToAvoid:
            if word not in words_ok_to_redefine: continue
        yield word,pos,phonemes

3552

def check_redef(self,wordsAndPhonemes):
    "Diagnostic function: writes to standard error a table of the 'redefinitions' we want to make. wordsAndPhonemes is a list of (original system-lexicon word, proposed new phonemes); for each word already present in allWords, its old phonemes (fetched from allPh) are listed beside the new ones."
    systemWords = self.allWords()
    systemPh = 0 # fetched (and the heading printed) on the first clash only
    for w,newPh in wordsAndPhonemes:
        w = w.lower()
        if re.match("^[a-z0-9]*$",w) and w in systemWords:
            if not systemPh:
                systemPh = self.allPh()
                sys.stderr.write("Warning: some words were already in system lexicon\nword\told\tnew\n")
            oldPh = systemPh[systemWords.index(w)][1]
            sys.stderr.write(w+"\t"+oldPh+"\t"+newPh+"\n")

3563

def speakPhones(self,phonesList):
    "Speaks every phonetic word in phonesList"
    # Placeholder 'words' 0s, 1s, 2s, ... : one per entry of phonesList
    words = []
    for n in range(len(phonesList)): words.append(str(n)+"s")
    substitutes = self.setMultiple(words,phonesList)
    # Pipe the substitute words (empty if no slot was found) to the say command
    toSpeak = as_utf8(" ").join(substitutes.get(w,as_utf8("")) for w in words)
    sayPipe = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
    getBuf(sayPipe).write(toSpeak)

3569

def readWithLex(self,lex):
    "Reads the text given in the constructor after setting up the lexicon with the given (word,phoneme) list"
    # self.check_redef(lex) # uncomment if you want to know about these
    textToPrint = u' '+self.textToAvoid+u' '
    # tta = the text to speak.  Ignore pronunciation marks 2032 and b7
    # (etc) that might be in the text, but still print them in
    # textToPrint; also normalise apostrophes but not in textToPrint,
    # and be careful with dashes as lex'ing the word after a hyphen or
    # em-dash won't work BUT we still want to support hyphenated words
    # IN the lexicon, so em-dashes are replaced here and hyphens are
    # included in nonWordBefore below
    tta = self.textToAvoid
    for old,new in [(u'\u2019',"'"),(u'\u2032',''),(u'\u00b4',''),(u'\u02b9',''),(u'\u00b7',''),(u'\u2014',' ')]:
        tta = tta.replace(old,new)
    tta = ' '+tta+' '
    nonWordBefore=r"(?i)(?<=[^A-Za-z"+chr(0)+"-])" # see below for why chr(0) is included, and see comment above for why hyphen is at the end; (?i) = ignore case
    nonWordAfter=r"(?=([^A-Za-z'"+unichr(0x2019)+"-]|['"+unichr(0x2019)+r"-][^A-Za-z]))" # followed by non-letter non-apostrophe, or followed by apostrophe non-letter (so not if followed by "'s", because the voice won't use our custom lex entry if "'s" is added to the lex'd word, TODO: automatically add "'s" versions to the lexicon via +s or +iz?) (also not if followed by hyphen-letters; hyphen before start is handled above, although TODO preceded by non-letter + hyphen might be OK)
    # keep only the ones actually used in the text (no point setting whole lexicon)
    usedWords,usedPhonemes = [],[]
    ttal = tta.lower()
    for ww,pp in lex:
        ww = ww.decode('utf-8') # so you can add words with accents etc (in utf-8) to the lexicon
        if not ww.lower() in ttal: continue # cheap check before the regexp
        if re.search(nonWordBefore+re.escape(ww)+nonWordAfter,tta):
            usedWords.append(ww)
            usedPhonemes.append(pp)
    for k,v in self.setMultiple(usedWords,usedPhonemes).iteritems():
        # chr(0) goes before each substitute so that later substitutions don't match it (nonWordBefore excludes chr(0))
        tta = re.sub(nonWordBefore+re.escape(k)+nonWordAfter,chr(0)+v,tta)
        # In the printed copy, wrap the original word (allowing pronunciation marks between its letters) in chr(0)...chr(1) markers
        markedWord = u'[\u2032\u00b4\u02b9\u00b7]*'.join(re.escape(c) for c in k)
        textToPrint = re.sub(nonWordBefore+'('+markedWord+')'+nonWordAfter,chr(0)+r'\1'+chr(1),textToPrint)
    tta = tta.replace(chr(0),'')
    term = os.environ.get("TERM","")
    if ("xterm" in term or term=="screen") and sys.stdout.isatty(): # we can probably underline words (inverse is more widely supported than underline, e.g. should work even on an old Linux console in case someone's using that to control an OS X server, but there might be a *lot* of words, which wouldn't be very good in inverse if user needs dark background and inverse is bright. Unlike Annogen, we're dealing primarily with Latin letters.)
        import textwrap
        textwrap.len = lambda x: len(x.replace(chr(0),"").replace(chr(1),"")) # a 'hack' to make (at least the 2.x implementations of) textwrap ignore our chr(0) and chr(1) markers in their calculations. Relies on textwrap calling len().
        wrapped = textwrap.fill(textToPrint,stdout_width_unix(),break_on_hyphens=False) # break_on_hyphens=False because we don't really want hyphenated NAMES to be split across lines, and anyway textwrap in (at least) Python 2.7 has a bug that sometimes causes a line breaks to be inserted before a syllable marker symbol like 'prime'
        print (wrapped.encode('utf-8').replace(chr(0),"\x1b[4m").replace(chr(1),"\x1b[0m").strip())
    # else don't print anything (saves confusion)
    sayPipe = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
    getBuf(sayPipe).write(tta.encode('utf-8'))

3594

def setMultiple(self,words,phonemes):
    """Sets phonemes for words, returning dict of word to substitute word. Flushes file buffer before return.

    words and phonemes are parallel sequences.  Each needed phoneme
    string is written (via self.set) into a usable system-lexicon slot
    whose original phoneme string is at least as long.  If the slots
    run out, a message is written to standard error and the dict built
    so far is returned."""
    avail = [(len(phon),word,pos,phon) for word,pos,phon in self.usable_words(words)]
    needed = [(len(phon),word,phon) for word,phon in zip(words,phonemes)]
    avail.sort() ; needed.sort() # shortest phon first
    i = 0 ; wDic = {} ; iDone=set() ; mustBeAlpha=True
    # mustBeAlpha: prefer alphabetical words, since
    # these can be capitalised at start of sentence
    # (the prosody doesn't always work if it isn't)
    startsWithLetter = re.compile(as_utf8("[A-Za-z]"))
    for l,word,phon in needed:
        # Advance i to the next free slot that is long enough.  The bounds
        # check comes first so that an empty avail, or having just used
        # the last slot, cannot cause an IndexError.
        while True:
            if i >= len(avail):
                if mustBeAlpha: # desperate situation: we HAVE to use the non-alphabetical slots now (ideally we should pick words that never occur at start of sentence for them, but this branch is hopefully a rare situation in practice)
                    mustBeAlpha=False ; i=0 ; continue
                sys.stderr.write("Could not find enough lexicon slots!\n") # TODO: we passed 'words' to usable_words's words_ok_to_redefine - this might not be the case if we didn't find enough slots
                self.dFile.flush() ; return wDic
            if avail[i][0] >= l and not i in iDone and (not mustBeAlpha or startsWithLetter.match(avail[i][1])): break
            i += 1
        iDone.add(i)
        _,wSubst,pos,oldPhon = avail[i] ; i += 1
        if pos in self.restoreDic: oldPhon=None # check the slot we are actually using (not the one after it). Shouldn't happen if setMultiple is called only once, but might be useful for small experiments in the Python interpreter etc
        self.set(pos,phon,oldPhon)
        wDic[word] = wSubst[:1].upper()+wSubst[1:] # always capitalise it so it can be used at start of sentence too (TODO: copy original capitalisation of each instance instead, in case it happens to come directly after a dotted abbreviation? although if it's something that's always capitalised anyway, e.g. most names, then this won't make any difference)
    self.dFile.flush() ; return wDic

3620

def set(self,phPos,val,old=None):\r

3621

"""Sets phonemes at position phPos to new value.\r

3622

Caller should flush the file buffer when done."""\r

3623

# print "Debugger: setting %x to %s" % (phPos,val)\r

3624

if old:\r