feat(unitproc):Add bkipas (IPA pronunciation tool)
[BK-2020-03.git] / unitproc / bkipas.d / lexconvert.py
1 #!/usr/bin/env python
2 # May be run with either Python 2 or Python 3
3
4 """lexconvert v0.32 - convert phonemes between different speech synthesizers etc
5 (c) 2007-20 Silas S. Brown. License: GPL"""
6
7 # Run without arguments for usage information
8
9 # This program is free software; you can redistribute it and/or modify
10 # it under the terms of the GNU General Public License as published by
11 # the Free Software Foundation; either version 3 of the License, or
12 # (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18
19 # Old versions of this code are being kept in the E-GuideDog SVN repository at
20 # http://svn.code.sf.net/p/e-guidedog/code/ssb22/lexconvert
21 # and on GitHub at https://github.com/ssb22/lexconvert
22 # and on GitLab at https://gitlab.com/ssb22/lexconvert
23 # and on Bitbucket https://bitbucket.org/ssb22/lexconvert
24 # and at https://gitlab.developers.cam.ac.uk/ssb22/lexconvert
25 # although some early ones are missing.
26
def Phonemes():
    """Define every phoneme known to lexconvert and return them by name.

    Phonemes are created by calling vowel(), consonant(),
    opt_vowel(), other() and variant(), strictly in the order
    given by the table below.  Each table row names one
    non-variant phoneme, the function that creates it, and the
    variants (if any) that are created immediately after it.

    Variants: when a variant does not exist in a destination
    format, it is treated as equivalent to the last non-variant
    created before it - which is why each variant is listed on
    the row of the phoneme it follows.

    Anything else that a destination format lacks: we first try
    to break the source's phoneme into parts (e.g. see how
    eSpeak and bbcmicro treat opt_ol_as_in_gold); if that still
    doesn't work, a character is dropped (with a warning that
    depends on the source format's safe_to_drop_characters
    setting).  makeDic does however warn when a format is
    missing any non-variant consonant, or any non-variant vowel
    that was not marked optional.

    Returns a dictionary mapping each phoneme's name to the
    value its creating function returned.
    """
    # (creator, name of non-variant phoneme, names of its variants)
    table = (
        (vowel, "a_as_in_ah", ["var1_a_as_in_ah", "var3_a_as_in_ah",
                               "var4_a_as_in_ah", "var5_a_as_in_ah"]),
        (vowel, "a_as_in_apple", []),
        # u_as_in_but: or the first part of un as in hunt
        (vowel, "u_as_in_but", ["var1_u_as_in_but"]),
        (vowel, "o_as_in_orange", ["var1_o_as_in_orange",
                                   "var2_o_as_in_orange"]),
        (vowel, "o_as_in_now", ["var1_o_as_in_now"]),
        (vowel, "a_as_in_ago", ["var1_a_as_in_ago"]),
        (vowel, "e_as_in_herd", ["ar_as_in_year"]),
        (vowel, "eye", ["var1_eye"]),
        (consonant, "b", []),
        (consonant, "ch", []),
        (consonant, "d", []),
        (consonant, "th_as_in_them", []),
        (vowel, "e_as_in_them", ["var1_e_as_in_them"]),
        (vowel, "a_as_in_air", ["var1_a_as_in_air", "var2_a_as_in_air",
                                "var3_a_as_in_air", "var4_a_as_in_air"]),
        (vowel, "a_as_in_ate", ["var1_a_as_in_ate"]),
        (consonant, "f", []),
        (consonant, "g", []),
        (consonant, "h", []),
        (vowel, "i_as_in_it", ["var1_i_as_in_it", "var2_i_as_in_it"]),
        (vowel, "ear", ["var1_ear", "var2_ear"]),
        (vowel, "e_as_in_eat", ["var1_e_as_in_eat"]),
        (consonant, "j_as_in_jump", []),
        (consonant, "k", ["opt_scottish_loch"]),
        (consonant, "l", ["var1_l"]),
        (consonant, "m", []),
        (consonant, "n", []),
        (consonant, "ng", []),
        (vowel, "o_as_in_go", ["var1_o_as_in_go", "var2_o_as_in_go"]),
        # opt_ol_as_in_gold: see eSpeak / bbcmicro
        (opt_vowel, "opt_ol_as_in_gold", []),
        (vowel, "oy_as_in_toy", ["var1_oy_as_in_toy"]),
        (consonant, "p", []),
        (consonant, "r", ["var1_r"]),
        (consonant, "s", []),
        (consonant, "sh", []),
        (consonant, "t", ["var1_t"]),
        (consonant, "th_as_in_think", []),
        (vowel, "oor_as_in_poor", ["var1_oor_as_in_poor",
                                   "opt_u_as_in_pull"]),
        # opt_ul_as_in_pull: see eSpeak / bbcmicro
        (opt_vowel, "opt_ul_as_in_pull", []),
        (vowel, "oo_as_in_food", ["var1_oo_as_in_food",
                                  "var2_oo_as_in_food"]),
        (vowel, "close_to_or", ["var1_close_to_or", "var2_close_to_or",
                                "var3_close_to_or"]),
        (consonant, "v", []),
        (consonant, "w", ["var1_w"]),
        (consonant, "y", []),
        (consonant, "z", []),
        (consonant, "ge_of_blige_etc", []),
        (other, "glottal_stop", []),
        (other, "syllable_separator", ["primary_stress",
                                       "secondary_stress"]),
        (other, "text_sharp", []),
        (other, "text_underline", []),
        (other, "text_question", []),
        (other, "text_exclamation", []),
        (other, "text_comma", []),
        # ipa_colon: for catching missed cases
        (other, "ipa_colon", []),
    )

    phonemes = {}
    for create, name, variant_names in table:
        phonemes[name] = create()
        for variant_name in variant_names:
            # variant() yields a pair; only its second item is kept
            _, phonemes[variant_name] = variant()
    return phonemes
138
139 def LexFormats():
140 """Makes the phoneme conversion tables of each format.
141 Each table has string to phoneme entries and phoneme
142 to string entries. The string to phoneme entries are
143 used when converting OUT of that format, and the
144 phoneme to string entries are used when converting IN
145 (so you can recognise phonemes you don't support and
146 convert them to something else). By default, a tuple
147 of the form (string,phoneme) will create entries in
148 BOTH directions; one-directional entries are created
149 via (string,phoneme,False) or (phoneme,string,False).
150 The makeDic function checks the keys are unique.
151
152 First parameter is always a description of the
153 format, then come the phoneme entries as described
154 above, then any additional settings:
155
156 stress_comes_before_vowel (default False means any
157 stress mark goes AFTER the affected vowel; set to
158 True if the format requires stress placed before)
159
160 word_separator (default same as phoneme_separator)
161 phoneme_separator (default " ")
162 clause_separator (default newline)
163
164 (For a special case, clause_separator can also be
165 set to a function. If that happens, the function
166 will be called whenever lexconvert needs to output
167 a list of (lists of words) in this format. See
168 bbcmicro for an example function clause_separator)
169
170 safe_to_drop_characters (default False, can be a
171 string of safe characters or True = all; controls
172 warnings when unrecognised characters are found)
173
174 approximate_missing (default False) - if True,
175 makeDic will attempt to compensate for missing
176 phonemes by approximating them to others, instead of
177 warning about them. This is useful for American codes
178 that can't cope with all the British English phonemes.
179 (Approximation is done automatically anyway in the
180 case of variant phonemes; approximate_missing adds in
181 some additional approximations - see comments in code)
182
183 cleanup_regexps (default none) - optional list of
184 (search,replace) regular expressions to "clean up"
185 after converting each word INTO this format
186 cleanup_func (default none) - optional special-case
187 function to pass result through after cleanup_regexps
188
189 cvtOut_regexps (default none) - optional list of
190 (search,replace) regular expressions to "clean up"
191 before starting to convert OUT of this format
192 cvtOut_func (default none) - optional special-case
193 function to pass through before any cvtOut_regexps
194
195 inline_format (default "%s") the format string for
196 printing a word with --phones or --phones2phones
197 (can be used to put markup around each word)
198 (can also be a function taking the phonetic word
199 and returning the resulting string, e.g. bbcmicro)
200
201 output_is_binary (default False) - True if the output
202 is almost certainly unsuitable for a terminal; will
203 cause lexconvert to refuse to print phonemes unless
204 its standard output is redirected to a file or pipe
205 (affects the --phones and --phones2phones options)
206
207 inline_header (default none) text to print first
208 when outputting from --phones or --phones2phones
209 inline_footer (default none) text to print last
210 inline_oneoff_header (default none) text to print
211 before inline_header on the first time only
212
213 lex_filename - filename of a lexicon file. If this
214 is not specified, there is no support for writing a
215 lexicon in this format: there can still be READ
216 support if you define lex_read_function to open the
217 lexicon by itself, but otherwise the format can be
218 used only with --phones and --phones2phones.
219
220 lex_entry_format - format string for writing each
221 (word, pronunciation) entry to the lexicon file.
222 This is also needed for lexicon-write support.
223
224 lex_header, lex_footer - optional strings to write
225 at the beginning and at the end of the lexicon file
226 (can also be functions that take the open file as a
227 parameter, e.g. for bbcmicro; lex_footer is
228 allowed to close the file if it needs to do
229 something with it afterwards)
230
231 lex_word_case - optional "upper" or "lower" to
232 force a particular case for lexicon words (not
233 pronunciations - they're determined by the table).
234 The default is to allow words to be in either case.
235
236 lex_type (default "") - used by the --formats
237 option when summarising the support for each format
238
239 lex_read_function - Python function to READ the
240 lexicon file and return a (word,phonemes) list.
241 If this is not specified, there's no read support
242 for lexicons in this format (but there can still be
243 write support - see above - and you can still use
244 --phones and --phones2phones). If lex_filename is
245 specified then this function will be given the open
246 file as a parameter. """
247
248 phonemes = Phonemes() ; globals().update(phonemes)
249 return { "festival" : makeDic(
250 "Festival's British voice",
251 ('0',syllable_separator),
252 ('1',primary_stress),
253 ('2',secondary_stress),
254 ('aa',a_as_in_ah),
255 ('a',a_as_in_apple),
256 ('uh',u_as_in_but),
257 ('o',o_as_in_orange),
258 ('au',o_as_in_now),
259 ('@',a_as_in_ago),
260 ('@@',e_as_in_herd),
261 ('ai',eye),
262 ('b',b),
263 ('ch',ch),
264 ('d',d),
265 ('dh',th_as_in_them),
266 ('e',e_as_in_them),
267 (ar_as_in_year,'@@',False),
268 ('e@',a_as_in_air),
269 ('ei',a_as_in_ate),
270 ('f',f),
271 ('g',g),
272 ('h',h),
273 ('i',i_as_in_it),
274 ('i@',ear),
275 ('ii',e_as_in_eat),
276 ('jh',j_as_in_jump),
277 ('k',k),
278 ('l',l),
279 ('m',m),
280 ('n',n),
281 ('ng',ng),
282 ('ou',o_as_in_go),
283 ('oi',oy_as_in_toy),
284 ('p',p),
285 ('r',r),
286 ('s',s),
287 ('sh',sh),
288 ('t',t),
289 ('th',th_as_in_think),
290 ('u@',oor_as_in_poor),
291 ('u',opt_u_as_in_pull),
292 ('uu',oo_as_in_food),
293 ('oo',close_to_or),
294 ('v',v),
295 ('w',w),
296 ('y',y),
297 ('z',z),
298 ('zh',ge_of_blige_etc),
299 lex_filename=ifset("HOME",os.environ.get("HOME","")+os.sep)+".festivalrc",
300 lex_entry_format="(lex.add.entry '( \"%s\" n %s))\n",
301 lex_header=";; -*- mode: lisp -*-\n(eval (list voice_default))\n",
302 lex_read_function = lambda *args:eval('['+getoutput("grep -vi parameter.set < ~/.festivalrc | grep -v '(eval' | sed -e 's/;.*//' -e 's/.lex.add.entry//' -e s/\"'\"'[(] *\"/[\"/' -e 's/\" [^ ]* /\",(\"/' -e 's/\".*$/&\"],/' -e 's/[()]/ /g' -e 's/ */ /g'")+']'),
303 safe_to_drop_characters=True, # TODO: really? (could instead give a string of known-safe characters)
304 cleanup_func = festival_group_stress,
305 ),
306
307 "example" : makeVariantDic(
308 "A small built-in example lexicon for testing when you don't have your full custom lexicon to hand. Use --convert to write it in one of the other formats and see if a synth can import it.",
309 lex_read_function = lambda *args: [
310 ("Shadrach","shei1drak"),
311 ("Meshach","mii1shak"),
312 ("Abednego","@be1dniigou"),
313 ], cleanup_func = None,
314 lex_filename=None, lex_entry_format=None, noInherit=True),
315
316 "festival-cmu" : makeVariantDic(
317 "American CMU version of Festival",
318 ('ae',a_as_in_apple),
319 ('ah',u_as_in_but),
320 ('ax',a_as_in_ago),
321 (o_as_in_orange,'aa',False),
322 ('aw',o_as_in_now),
323 ('er',e_as_in_herd), # TODO: check this one
324 ('ay',eye),
325 ('eh',e_as_in_them),
326 (ar_as_in_year,'er',False),
327 (a_as_in_air,'er',False),
328 ('ey',a_as_in_ate),
329 ('hh',h),
330 ('ih',i_as_in_it),
331 ('ey ah',ear),
332 ('iy',e_as_in_eat),
333 ('ow',o_as_in_go),
334 ('oy',oy_as_in_toy),
335 ('uh',oor_as_in_poor),
336 ('uw',oo_as_in_food),
337 ('ao',close_to_or),
338 ),
339
340 "espeak" : makeDic(
341 "eSpeak's default British voice", # but eSpeak's phoneme representation isn't always that simple, hence the regexps at the end
342 ('%',syllable_separator),
343 ("'",primary_stress),
344 (',',secondary_stress),
345 # TODO: glottal_stop? (in regional pronunciations etc)
346 ('A:',a_as_in_ah),
347 ('A@',a_as_in_ah,False),
348 ('A',var1_a_as_in_ah),
349 ('a',a_as_in_apple),
350 ('aa',a_as_in_apple,False),
351 ('a2',a_as_in_apple,False), # TODO: this is actually an a_as_in_apple variant in espeak; festival @1 is not in mrpa PhoneSet
352 ('&',a_as_in_apple,False),
353 ('V',u_as_in_but),
354 ('0',o_as_in_orange),
355 ('aU',o_as_in_now),
356 ('@',a_as_in_ago),
357 ('a#',a_as_in_ago,False), # (TODO: eSpeak sometimes uses a# in 'had' when in a sentence, and this doesn't always sound good on other synths; might sometimes want to convert it to a_as_in_apple; not sure what contexts would call for this though)
358 ('3:',e_as_in_herd),
359 ('3',var1_a_as_in_ago),
360 ('@2',a_as_in_ago,False),
361 ('@-',a_as_in_ago,False), # (eSpeak @- sounds to me like a shorter version of @, TODO: double-check the relationship between @ and @2 in Festival)
362 ('aI',eye),
363 ('aI2',eye,False),
364 ('aI;',eye,False),
365 ('aI2;',eye,False),
366 ('b',b),
367 ('tS',ch),
368 ('d',d),
369 ('D',th_as_in_them),
370 ('E',e_as_in_them),
371 (ar_as_in_year,'3:',False),
372 ('e@',a_as_in_air),
373 ('eI',a_as_in_ate),
374 ('f',f),
375 ('g',g),
376 ('h',h),
377 ('I',i_as_in_it),
378 ('I;',i_as_in_it,False),
379 ('i',i_as_in_it,False),
380 ('I2',var2_i_as_in_it,False),
381 ('I2;',var2_i_as_in_it,False),
382 ('i@',ear),
383 ('i@3',var2_ear),
384 ('i:',e_as_in_eat),
385 ('i:;',e_as_in_eat,False),
386 ('dZ',j_as_in_jump),
387 ('k',k),
388 ('x',opt_scottish_loch),
389 ('l',l),
390 ('L',l,False),
391 ('m',m),
392 ('n',n),
393 ('N',ng),
394 ('oU',o_as_in_go),
395 ('oUl',opt_ol_as_in_gold), # (espeak says "gold" in a slightly 'posh' way though) (if dest format doesn't have opt_ol_as_in_gold, it'll get o_as_in_go + the l)
396 ('OI',oy_as_in_toy),
397 ('p',p),
398 ('r',r),
399 ('r-',r,False),
400 ('s',s),
401 ('S',sh),
402 ('t',t),
403 ('T',th_as_in_think),
404 ('U@',oor_as_in_poor),
405 ('U',opt_u_as_in_pull),
406 ('@5',opt_u_as_in_pull,False),
407 ('Ul',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the U, then the l
408 ('u:',oo_as_in_food),
409 ('O:',close_to_or),
410 ('O@',var3_close_to_or),
411 ('o@',var3_close_to_or,False),
412 ('O',var3_close_to_or,False),
413 ('v',v),
414 ('w',w),
415 ('j',y),
416 ('z',z),
417 ('Z',ge_of_blige_etc),
418 lex_filename = "en_extra",
419 lex_entry_format = "%s %s\n",
420 lex_read_function = lambda lexfile: [x for x in [l.split()[:2] for l in lexfile.readlines()] if len(x)==2 and not '//' in x[0]],
421 lex_footer=lambda f:(f.close(),os.system("espeak --compile=en")), # see also a bit of special-case code in mainopt_convert
422 inline_format = "[[%s]]",
423 word_separator=" ",phoneme_separator="",
424 stress_comes_before_vowel=True,
425 safe_to_drop_characters="_: !",
426 cleanup_regexps=[
427 ("k'a2n","k'@n"),
428 ("ka2n","k@n"),
429 ("gg","g"),
430 ("@U","oU"), # (eSpeak uses oU to represent @U; difference is given by its accent parameters)
431 ("([iU]|([AO]:))@r$","\1@"),
432 ("([^e])@r",r"\1_remove_3"),("_remove_",""),
433 # (r"([^iU]@)l",r"\1L") # only in older versions of espeak (not valid in more recent versions)
434 ("rr$","r"),
435 ("3:r$","3:"),
436 ("%%+","%"),("^%",""),("%$",""),
437 # TODO: 'declared' & 'declare' the 'r' after the 'E' sounds a bit 'regional' (but pretty). but sounds incomplete w/out 'r', and there doesn't seem to be an E2 or E@
438 # TODO: consider adding 'g' to words ending in 'N' (if want the 'g' pronounced in '-ng' words) (however, careful of words like 'yankee' where the 'g' would be followed by a 'k'; this may also be a problem going into the next word)
439 ],
440 cvtOut_regexps = [
441 ("e@r$","e@"), ("e@r([bdDfghklmnNprsStTvwjzZ])",r"e@\1"), # because the 'r' is implicit in other synths (but DO have it if there's another vowel to follow)
442 ],
443 ),
444
445 "sapi" : makeDic(
446 "Microsoft Speech API (American English)",
447 ('-',syllable_separator),
448 ('1',primary_stress),
449 ('2',secondary_stress),
450 ('aa',a_as_in_ah),
451 ('ae',a_as_in_apple),
452 ('ah',u_as_in_but),
453 ('ao',o_as_in_orange),
454 ('aw',o_as_in_now),
455 ('ax',a_as_in_ago),
456 ('er',e_as_in_herd),
457 ('ay',eye),
458 ('b',b),
459 ('ch',ch),
460 ('d',d),
461 ('dh',th_as_in_them),
462 ('eh',e_as_in_them),
463 ('ey',var1_e_as_in_them),
464 (a_as_in_ate,'ey',False),
465 ('f',f),
466 ('g',g),
467 ('h',h), # Jan suggested 'hh', but I can't get this to work on Windows XP (TODO: try newer versions of Windows)
468 ('ih',i_as_in_it),
469 ('iy',e_as_in_eat),
470 ('jh',j_as_in_jump),
471 ('k',k),
472 ('l',l),
473 ('m',m),
474 ('n',n),
475 ('ng',ng),
476 ('ow',o_as_in_go),
477 ('oy',oy_as_in_toy),
478 ('p',p),
479 ('r',r),
480 ('s',s),
481 ('sh',sh),
482 ('t',t),
483 ('th',th_as_in_think),
484 ('uh',oor_as_in_poor),
485 ('uw',oo_as_in_food),
486 ('AO',close_to_or),
487 ('v',v),
488 ('w',w),
489 # ('x',var1_w), # suggested by Jan, but I can't get this to work on Windows XP (TODO: try newer versions of Windows)
490 ('y',y),
491 ('z',z),
492 ('zh',ge_of_blige_etc),
493 approximate_missing=True,
494 lex_filename="run-ptts.bat", # write-only for now
495 lex_header = "rem You have to run this file\nrem with ptts.exe in the same directory\nrem to add these words to the SAPI lexicon\n\n",
496 lex_entry_format='ptts -la %s "%s"\n',
497 inline_format = '<pron sym="%s"/>',
498 safe_to_drop_characters=True, # TODO: really?
499 ),
500
501 "cepstral" : makeDic(
502 "Cepstral's British English SSML phoneset",
503 ('0',syllable_separator),
504 ('1',primary_stress),
505 ('a',a_as_in_ah),
506 ('ae',a_as_in_apple),
507 ('ah',u_as_in_but),
508 ('oa',o_as_in_orange),
509 ('aw',o_as_in_now),
510 ('er',e_as_in_herd),
511 ('ay',eye),
512 ('b',b),
513 ('ch',ch),
514 ('d',d),
515 ('dh',th_as_in_them),
516 ('eh',e_as_in_them),
517 ('e@',a_as_in_air),
518 ('ey',a_as_in_ate),
519 ('f',f),
520 ('g',g),
521 ('h',h),
522 ('ih',i_as_in_it),
523 ('i',e_as_in_eat),
524 ('jh',j_as_in_jump),
525 ('k',k),
526 ('l',l),
527 ('m',m),
528 ('n',n),
529 ('ng',ng),
530 ('ow',o_as_in_go),
531 ('oy',oy_as_in_toy),
532 ('p',p),
533 ('r',r),
534 ('s',s),
535 ('sh',sh),
536 ('t',t),
537 ('th',th_as_in_think),
538 ('uh',oor_as_in_poor),
539 ('uw',oo_as_in_food),
540 ('ao',close_to_or),
541 ('v',v),
542 ('w',w),
543 ('j',y),
544 ('z',z),
545 ('zh',ge_of_blige_etc),
546 approximate_missing=True,
547 lex_filename="lexicon.txt",
548 lex_entry_format = "%s 0 %s\n",
549 lex_read_function = lambda lexfile: [(word,pronunc) for word, ignore, pronunc in [l.split(None,2) for l in lexfile.readlines()]],
550 lex_word_case = "lower",
551 inline_format = "<phoneme ph='%s'>p</phoneme>",
552 safe_to_drop_characters=True, # TODO: really?
553 cleanup_regexps=[(" 1","1"),(" 0","0")],
554 ),
555
556 "mac" : makeDic(
557 "approximation in American English using the [[inpt PHON]] notation of Apple's US voices",
558 ('=',syllable_separator),
559 ('1',primary_stress),
560 ('2',secondary_stress),
561 ('AA',a_as_in_ah),
562 ('aa',var5_a_as_in_ah),
563 ('AE',a_as_in_apple),
564 ('UX',u_as_in_but),
565 (o_as_in_orange,'AA',False),
566 ('AW',o_as_in_now),
567 ('AX',a_as_in_ago),
568 (e_as_in_herd,'AX',False), # TODO: is this really the best approximation?
569 ('AY',eye),
570 ('b',b),
571 ('C',ch),
572 ('d',d),
573 ('D',th_as_in_them),
574 ('EH',e_as_in_them),
575 ('EY',a_as_in_ate),
576 ('f',f),
577 ('g',g),
578 ('h',h),
579 ('IH',i_as_in_it),
580 ('IX',var2_i_as_in_it),
581 ('IY',e_as_in_eat),
582 ('J',j_as_in_jump),
583 ('k',k),
584 ('l',l),
585 ('m',m),
586 ('n',n),
587 ('N',ng),
588 ('OW',o_as_in_go),
589 ('OY',oy_as_in_toy),
590 ('p',p),
591 ('r',r),
592 ('s',s),
593 ('S',sh),
594 ('t',t),
595 ('T',th_as_in_think),
596 ('UH',oor_as_in_poor),
597 ('UW',oo_as_in_food),
598 ('AO',close_to_or),
599 ('v',v),
600 ('w',w),
601 ('y',y),
602 ('z',z),
603 ('Z',ge_of_blige_etc),
604 approximate_missing=True,
605 lex_filename="substitute.sh", # write-only for now
606 lex_type = "substitution script",
607 lex_header = "#!/bin/bash\n\n# I don't yet know how to add to the Apple US lexicon,\n# so here is a 'sed' command you can run on your text\n# to put the pronunciation inline:\n\nsed -E -e :S \\\n",
608 lex_entry_format=r" -e 's/(^|[^A-Za-z])%s($|[^A-Za-z[12=])/\1[[inpt PHON]]%s[[inpt TEXT]]\2/g'"+" \\\n",
609 # but /g is non-overlapping matches and won't catch 2 words in the lex right next to each other with only one non-alpha in between, so we put :S at start and tS at end to make the whole operation repeat until it hasn't done any more substitutions (hence also the exclusion of [, 1, 2 or = following a word so it doesn't try to substitute stuff inside the phonemes; TODO: assert the lexicon does not contain "inpt", "PHON" or "TEXT")
610 lex_footer = lambda f:(f.write(" -e tS\n"),f.close(),os.chmod("substitute.sh",493)), # 493 = 0755, but no way to specify octal that works on both Python 2.5 and Python 3 (0o works on 2.6+)
611 inline_format = "[[inpt PHON]]%s[[inpt TEXT]]",
612 word_separator=" ",phoneme_separator="",
613 safe_to_drop_characters=True, # TODO: really?
614 ),
615
616 "mac-uk" : makeDic(
617 "Scansoft/Nuance British voices in Mac OS 10.7+ (system lexicon editing required, see --mac-uk option)",
618 ('.',syllable_separator),
619 ("'",primary_stress),
620 (secondary_stress,'',False),
621 ('A',a_as_in_ah),
622 ('@',a_as_in_apple),
623 ('$',u_as_in_but),
624 (a_as_in_ago,'$',False),
625 ('A+',o_as_in_orange),
626 ('a&U',o_as_in_now),
627 ('E0',e_as_in_herd),
628 ('a&I',eye),
629 ('b',b),
630 ('t&S',ch),
631 ('d',d),
632 ('D',th_as_in_them),
633 ('E',e_as_in_them),
634 ('0',ar_as_in_year),
635 ('E&$',a_as_in_air),
636 ('e&I',a_as_in_ate),
637 ('f',f),
638 ('g',g),
639 ('h',h),
640 ('I',i_as_in_it),
641 ('I&$',ear),
642 ('i',e_as_in_eat),
643 ('d&Z',j_as_in_jump),
644 ('k',k),
645 ('l',l),
646 ('m',m),
647 ('n',n),
648 ('nK',ng),
649 ('o&U',o_as_in_go),
650 ('O&I',oy_as_in_toy),
651 ('p',p),
652 ('R+',r),
653 ('s',s),
654 ('S',sh),
655 ('t',t),
656 ('T',th_as_in_think),
657 ('O',oor_as_in_poor),
658 ('U',opt_u_as_in_pull),
659 ('u',oo_as_in_food),
660 (close_to_or,'O',False),
661 ('v',v),
662 ('w',w),
663 ('j',y),
664 ('z',z),
665 ('Z',ge_of_blige_etc),
666 # lex_filename not set (mac-uk code does not permanently save the lexicon; see --mac-uk option to read text)
667 lex_read_function = lambda *args:[(w,p) for w,_,p in MacBritish_System_Lexicon(False,os.environ.get("MACUK_VOICE","Daniel")).usable_words()],
668 inline_oneoff_header = "(mac-uk phonemes output is for information only; you'll need the --mac-uk or --trymac-uk options to use it)\n",
669 word_separator=" ",phoneme_separator="",
670 stress_comes_before_vowel=True,
671 safe_to_drop_characters=True, # TODO: really?
672 cleanup_regexps=[(r'o\&U\.Ol', r'o\&Ul')],
673 ),
674
675 "x-sampa" : makeDic(
676 "General X-SAMPA notation, contributed by Jan Weiss",
677 ('.',syllable_separator),
678 ('"',primary_stress),
679 ('%',secondary_stress),
680 ('A',a_as_in_ah),
681 (':',ipa_colon),
682 ('A:',var3_a_as_in_ah),
683 ('Ar\\',var4_a_as_in_ah),
684 ('a:',var5_a_as_in_ah),
685 ('{',a_as_in_apple),
686 ('V',u_as_in_but),
687 ('Q',o_as_in_orange),
688 (var1_o_as_in_orange,'A',False),
689 ('O',var2_o_as_in_orange),
690 ('aU',o_as_in_now),
691 ('{O',var1_o_as_in_now),
692 ('@',a_as_in_ago),
693 ('3:',e_as_in_herd),
694 ('aI',eye),
695 ('Ae',var1_eye),
696 ('b',b),
697 ('tS',ch),
698 ('d',d),
699 ('D',th_as_in_them),
700 ('E',e_as_in_them),
701 ('e',var1_e_as_in_them),
702 (ar_as_in_year,'3:',False),
703 ('E@',a_as_in_air),
704 ('Er\\',var1_a_as_in_air),
705 ('e:',var2_a_as_in_air),
706 ('E:',var3_a_as_in_air),
707 ('e@',var4_a_as_in_air),
708 ('eI',a_as_in_ate),
709 ('{I',var1_a_as_in_ate),
710 ('f',f),
711 ('g',g),
712 ('h',h),
713 ('I',i_as_in_it),
714 ('1',var1_i_as_in_it),
715 ('I@',ear),
716 ('Ir\\',var1_ear),
717 ('i',e_as_in_eat),
718 ('i:',var1_e_as_in_eat),
719 ('dZ',j_as_in_jump),
720 ('k',k),
721 ('x',opt_scottish_loch),
722 ('l',l),
723 ('m',m),
724 ('n',n),
725 ('N',ng),
726 ('@U',o_as_in_go),
727 ('oU',var2_o_as_in_go),
728 ('@}',var1_u_as_in_but),
729 ('OI',oy_as_in_toy),
730 ('oI',var1_oy_as_in_toy),
731 ('p',p),
732 ('r\\',r),
733 (var1_r,'r',False),
734 ('s',s),
735 ('S',sh),
736 ('t',t),
737 ('T',th_as_in_think),
738 ('U@',oor_as_in_poor),
739 ('Ur\\',var1_oor_as_in_poor),
740 ('U',opt_u_as_in_pull),
741 ('}:',oo_as_in_food),
742 ('u:',var1_oo_as_in_food),
743 (var2_oo_as_in_food,'u:',False),
744 ('O:',close_to_or),
745 (var1_close_to_or,'O',False),
746 ('o:',var2_close_to_or),
747 ('v',v),
748 ('w',w),
749 ('W',var1_w),
750 ('j',y),
751 ('z',z),
752 ('Z',ge_of_blige_etc),
753 lex_filename="acapela.txt",
754 lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: may be able to convert part-of-speech (NOUN etc) to/from some other formats e.g. Festival
755 lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]],
756 # TODO: inline_format ?
757 word_separator=" ",phoneme_separator="",
758 safe_to_drop_characters=True, # TODO: really?
759 ),
760 "vocaloid" : makeVariantDic(
761 "X-SAMPA phonemes for Yamaha's Vocaloid singing synthesizer. Contributed by Lorenzo Gatti, who tested in Vocaloid 4 using two American English voices.",
762 ('-',syllable_separator),
763 (primary_stress,'',False), # not used by Vocaloid
764 (secondary_stress,'',False),
765 ('Q',a_as_in_ah),
766 (var3_a_as_in_ah,'Q',False),
767 (var4_a_as_in_ah,'Q',False),
768 (var5_a_as_in_ah,'Q',False),
769 ('O@',o_as_in_orange),
770 (var1_o_as_in_orange,'O@',False),
771 (var2_o_as_in_orange, 'O@',False),
772 ('@U',o_as_in_now),
773 ('@r',e_as_in_herd),
774 (var1_eye, 'aI',False),
775 ('e',e_as_in_them),
776 ('I@',ar_as_in_year),
777 ('e@',a_as_in_air),
778 (var1_a_as_in_air, 'e@',False),
779 (var2_a_as_in_air, 'e@',False),
780 (var3_a_as_in_air, 'e@',False),
781 (var4_a_as_in_air, 'e@',False),
782 (var1_a_as_in_ate, 'eI', False),
783 (var1_i_as_in_it, 'I',False),
784 (var1_ear, 'I@',False),
785 ('i:',e_as_in_eat),
786 (var1_e_as_in_eat, 'i:',False),
787 (var2_o_as_in_go, '@U', False),
788 ('V', var1_u_as_in_but),
789 (var1_oy_as_in_toy, 'OI',False),
790 ('r',r),
791 ('th',t),
792 (var1_oor_as_in_poor, '@U',False),
793 ('u:',oo_as_in_food),
794 (var1_oo_as_in_food, 'u:',False),
795 (var1_close_to_or,'O:',False),
796 (var2_close_to_or,'O:',False),
797 (var1_w, 'w', False),
798 lex_filename="vocaloid.txt",
799 phoneme_separator=" ",
800 noInherit=True
801 ),
802 "android-pico" : makeVariantDic(
803 'X-SAMPA phonemes for the default \"Pico\" voice in Android (1.6+, American), wrapped in Java code', # you could put en-GB instead of en-US, but it must be installed on the phone
804 ('A:',a_as_in_ah), # won't sound without the :
805 (var5_a_as_in_ah,'A:',False), # a: won't sound
806 ('@U:',o_as_in_go),
807 ('I',var1_i_as_in_it), # '1' won't sound
808 ('i:',e_as_in_eat), # 'i' won't sound
809 ('u:',oo_as_in_food), # }: won't sound
810 ('a_I',eye),('a_U',o_as_in_now),('e_I',a_as_in_ate),('O_I',oy_as_in_toy),(var1_oy_as_in_toy,'O_I',False),('o_U',var2_o_as_in_go),
811 cleanup_regexps=[(r'\\',r'\\\\'),('"','&quot;'),('::',':')],
812 lex_filename="",lex_entry_format="",
813 lex_read_function=None,
814 inline_oneoff_header=r'class Speak { public static void speak(android.app.Activity a,String s) { class OnInit implements android.speech.tts.TextToSpeech.OnInitListener { public OnInit(String s) { this.s = s; } public void onInit(int i) { mTts.speak(this.s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private String s; }; if(mTts==null) mTts=new android.speech.tts.TextToSpeech(a,new OnInit(s),"com.svox.pico"); else mTts.speak(s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private static android.speech.tts.TextToSpeech mTts = null; };'+'\n',
815 inline_header=r'Speak.speak(this,"<speak xml:lang=\"en-US\">',
816 inline_format=r'<phoneme alphabet=\"xsampa\" ph=\"%s\"/>',
817 clause_separator=r".\n", # note r"\n" != "\n"
818 inline_footer='</speak>");',
819 ),
820
821 "acapela-uk" : makeDic(
822 'Acapela-optimised X-SAMPA for UK English voices (e.g. "Peter"), contributed by Jan Weiss',
823 ('.',syllable_separator),('"',primary_stress),('%',secondary_stress), # copied from "x-sampa", not tested
824 ('A:',a_as_in_ah),
825 ('{',a_as_in_apple),
826 ('V',u_as_in_but),
827 ('Q',o_as_in_orange),
828 ('A',var1_o_as_in_orange),
829 ('O',var2_o_as_in_orange),
830 ('aU',o_as_in_now),
831 ('{O',var1_o_as_in_now),
832 ('@',a_as_in_ago),
833 ('3:',e_as_in_herd),
834 ('aI',eye),
835 ('A e',var1_eye),
836 ('b',b),
837 ('t S',ch),
838 ('d',d),
839 ('D',th_as_in_them),
840 ('e',e_as_in_them),
841 (ar_as_in_year,'3:',False),
842 ('e @',a_as_in_air),
843 ('e r',var1_a_as_in_air),
844 ('e :',var2_a_as_in_air),
845 (var3_a_as_in_air,'e :',False),
846 ('eI',a_as_in_ate),
847 ('{I',var1_a_as_in_ate),
848 ('f',f),
849 ('g',g),
850 ('h',h),
851 ('I',i_as_in_it),
852 ('1',var1_i_as_in_it),
853 ('I@',ear),
854 ('I r',var1_ear),
855 ('i',e_as_in_eat),
856 ('i:',var1_e_as_in_eat),
857 ('dZ',j_as_in_jump),
858 ('k',k),
859 ('x',opt_scottish_loch),
860 ('l',l),
861 ('m',m),
862 ('n',n),
863 ('N',ng),
864 ('@U',o_as_in_go),
865 ('o U',var2_o_as_in_go),
866 ('@ }',var1_u_as_in_but),
867 ('OI',oy_as_in_toy),
868 ('o I',var1_oy_as_in_toy),
869 ('p',p),
870 ('r',r),
871 ('s',s),
872 ('S',sh),
873 ('t',t),
874 ('T',th_as_in_think),
875 ('U@',oor_as_in_poor),
876 ('U r',var1_oor_as_in_poor),
877 ('U',opt_u_as_in_pull),
878 ('u:',oo_as_in_food),
879 ('O:',close_to_or),
880 (var1_close_to_or,'O',False),
881 ('v',v),
882 ('w',w),
883 ('j',y),
884 ('z',z),
885 ('Z',ge_of_blige_etc),
886 lex_filename="acapela.txt",
887 lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: part-of-speech (as above)
888 lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]],
889 inline_format = "\\Prn=%s\\",
890 safe_to_drop_characters=True, # TODO: really?
891 ),
892
893 "cmu" : makeDic(
894 'format of the US-English Carnegie Mellon University Pronouncing Dictionary, contributed by Jan Weiss', # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
895 ('0',syllable_separator),
896 ('1',primary_stress),
897 ('2',secondary_stress),
898 ('AA',a_as_in_ah),
899 (var1_a_as_in_ah,'2',False),
900 (ipa_colon,'1',False),
901 ('AE',a_as_in_apple),
902 ('AH',u_as_in_but),
903 (o_as_in_orange,'AA',False),
904 ('AW',o_as_in_now),
905 (a_as_in_ago,'AH',False), # seems they don't use AX as festival-cmu does
906 ('ER',e_as_in_herd), # TODO: check this one
907 ('AY',eye),
908 ('B',b),
909 ('CH',ch),
910 ('D',d),
911 ('DH',th_as_in_them),
912 ('EH',e_as_in_them),
913 (ar_as_in_year,'ER',False),
914 (a_as_in_air,'ER',False),
915 ('EY',a_as_in_ate),
916 ('F',f),
917 ('G',g),
918 ('HH',h),
919 ('IH',i_as_in_it),
920 ('EY AH',ear),
921 ('IY',e_as_in_eat),
922 ('JH',j_as_in_jump),
923 ('K',k),
924 ('L',l),
925 ('M',m),
926 ('N',n),
927 ('NG',ng),
928 ('OW',o_as_in_go),
929 ('OY',oy_as_in_toy),
930 ('P',p),
931 ('R',r),
932 ('S',s),
933 ('SH',sh),
934 ('T',t),
935 ('TH',th_as_in_think),
936 ('UH',oor_as_in_poor),
937 ('UW',oo_as_in_food),
938 ('AO',close_to_or),
939 ('V',v),
940 ('W',w),
941 ('Y',y),
942 ('Z',z),
943 ('ZH',ge_of_blige_etc),
944 # lex_filename not set (does CMU have a lex file?)
945 safe_to_drop_characters=True, # TODO: really?
946 ),
947
948 # BEGIN PRE-32bit ERA SYNTHS (TODO: add an attribute to JS-hide them by default in HTML? what about the SpeakJet which probably isn't a 32-bit chip but is post 32-bit era? and then what about the 'approximation' formats - kana etc - would they need hiding by default also? maybe best to just leave it)
949 "apollo" : makeDic(
950 'Dolphin Apollo 2 serial-port and parallel-port hardware synthesizers (in case anybody still uses those)',
951 (syllable_separator,'',False), # I don't think the Apollo had anything to mark stress; TODO: control the pitch instead like bbcmicro ?
952 ('_QQ',syllable_separator,False), # a slight pause
953 ('_AA',a_as_in_apple),
954 ('_AI',a_as_in_ate),
955 ('_AR',a_as_in_ah),
956 ('_AW',close_to_or),
957 ('_A',a_as_in_ago),
958 ('_B',b),
959 ('_CH',ch),
960 ('_D',d),
961 ('_DH',th_as_in_them),
962 ('_EE',e_as_in_eat),
963 ('_EI',a_as_in_air),
964 ('_ER',e_as_in_herd),
965 ('_E',e_as_in_them),
966 ('_F',f),
967 ('_G',g),
968 ('_H',h),
969 ('_IA',ear),
970 ('_IE',eye),
971 ('_I',i_as_in_it),
972 ('_J',j_as_in_jump),
973 ('_K',k),
974 ('_KK',k,False), # sCHool
975 ('_L',l),
976 ('_M',m),
977 ('_NG',ng),
978 ('_N',n),
979 ('_OA',o_as_in_go),
980 ('_OO',opt_u_as_in_pull),
981 ('_OR',var3_close_to_or),
982 ('_OW',o_as_in_now),
983 ('_OY',oy_as_in_toy),
984 ('_O',o_as_in_orange),
985 ('_P',p),
986 ('_PP',p,False), # sPeech (a stronger P ?)
987 # _Q = k w - done by cleanup_regexps below
988 ('_R',r),
989 ('_SH',sh),
990 ('_S',s),
991 ('_TH',th_as_in_think),
992 ('_T',t), ('_TT',t,False),
993 ('_UU',oo_as_in_food),
994 ('_U',u_as_in_but),
995 ('_V',v),
996 ('_W',w),
997 # _X = k s - done by cleanup_regexps below
998 ('_Y',y),
999 ('_ZH',ge_of_blige_etc),
1000 ('_Z',z),
1001 # lex_filename not set (the hardware doesn't have one; HAL has an "exceptions dictionary" but I don't know much about it)
1002 approximate_missing=True,
1003 safe_to_drop_characters=True, # TODO: really?
1004 word_separator=" ",phoneme_separator="",
1005 cleanup_regexps=[('_K_W','_Q'),('_K_S','_X')],
1006 cvtOut_regexps=[('_Q','_K_W'),('_X','_K_S')],
1007 ),
1008 "dectalk" : makeDic(
1009 'DECtalk hardware synthesizers (American English)', # (1984-ish serial port; later ISA cards)
1010 (syllable_separator,'',False),
1011 ("'",primary_stress),
1012 ('aa',o_as_in_orange),
1013 ('ae',a_as_in_apple),
1014 ('ah',u_as_in_but),
1015 ('ao',close_to_or), # bought
1016 ('aw',o_as_in_now),
1017 ('ax',a_as_in_ago),
1018 ('ay',eye),
1019 ('b',b),
1020 ('ch',ch),
1021 ('d',d), ('dx',d,False),
1022 ('dh',th_as_in_them),
1023 ('eh',e_as_in_them),
1024 ('el',l,False), # -le of bottle, allophone ?
1025 # TODO: en: -on of button (2 phonemes?)
1026 ('ey',a_as_in_ate),
1027 ('f',f),
1028 ('g',g),
1029 ('hx',h),
1030 ('ih',i_as_in_it), ('ix',i_as_in_it,False),
1031 ('iy',e_as_in_eat), ('q',e_as_in_eat,False),
1032 ('jh',j_as_in_jump),
1033 ('k',k),
1034 ('l',l), ('lx',l,False),
1035 ('m',m),
1036 ('n',n),
1037 ('nx',ng),
1038 ('ow',o_as_in_go),
1039 ('oy',oy_as_in_toy),
1040 ('p',p),
1041 ('r',r), ('rx',r,False),
1042 ('rr',e_as_in_herd),
1043 ('s',s),
1044 ('sh',sh),
1045 ('t',t), ('tx',t,False),
1046 ('th',th_as_in_think),
1047 ('uh',opt_u_as_in_pull),
1048 ('uw',oo_as_in_food),
1049 ('v',v),
1050 ('w',w),
1051 ('yx',y),
1052 ('z',z),
1053 ('zh',ge_of_blige_etc),
1054 ('ihr',ear), # DECtalk makes this from ih + r
1055 approximate_missing=True,
1056 cleanup_regexps=[('yxuw','yu')], # TODO: other allophones ("x',False" stuff above)?
1057 cvtOut_regexps=[('yu','yxuw')],
1058 # lex_filename not set (depends on which model etc)
1059 stress_comes_before_vowel=True,
1060 safe_to_drop_characters=True, # TODO: really?
1061 word_separator=" ",phoneme_separator="",
1062 inline_header="[:phoneme on]\n",
1063 inline_format="[%s]",
1064 ),
1065 "doubletalk" : makeDic(
1066 'DoubleTalk PC/LT serial-port hardware synthesizers (American English; assumes DOS driver by default, otherwise set DTALK_COMMAND_CODE to your current command-code binary value, e.g. export DTALK_COMMAND_CODE=1)', # (1 is the synth's default; the DOS driver lets you put * instead)
1067 (syllable_separator,'',False),
1068 ("/",primary_stress), # TODO: check it doesn't need a balancing \ afterwards (docs do say it's a "temporary" change of pitch, but it's unclear how long a 'temporary')
1069 ('M',m),('N',n),('NX',ng),('O',o_as_in_go),
1070 ('OW',o_as_in_go,False), # allophone
1071 (o_as_in_orange,'O',False), # TODO: is this the best approximation we can do?
1072 ('OY',oy_as_in_toy),('P',p),
1073 ('R',r),('S',s),('SH',sh),('T',t),
1074 ('TH',th_as_in_think),('V',v),('W',w),('Z',z),
1075 ('ZH',ge_of_blige_etc),('K',k),('L',l),
1076 ('PX',p,False), ('TX',t,False), # aspirated allophones
1077 ('WH',w,False), ('KX',k,False), # ditto
1078 ('YY',y),('Y',y,False),
1079 ('UH',opt_u_as_in_pull),('UW',oo_as_in_food),
1080 ('AA',a_as_in_ah),('AE',a_as_in_apple),
1081 ('AH',u_as_in_but),('AO',close_to_or),
1082 ('AW',o_as_in_now),('AX',a_as_in_ago),
1083 ('AY',eye),('B',b),('CH',ch),('D',d),
1084 ('DH',th_as_in_them),
1085 ('DX',t,False), # an American "d"-like "t"
1086 ('EH',e_as_in_them),('ER',e_as_in_herd),
1087 ('EY',a_as_in_ate),('F',f),('G',g),('H',h),
1088 ('IH',i_as_in_it),('IX',i_as_in_it,False),
1089 ('IY',e_as_in_eat),('JH',j_as_in_jump),
1090 approximate_missing=True,
1091 stress_comes_before_vowel=True,
1092 inline_format=markup_doubleTalk_word,
1093 format_is_binary=ifset('DTALK_COMMAND_CODE',True),
1094 # DoubleTalk does have a loadable "exceptions dictionary" but usually relies on a DOS tool to write it; I don't have the documentation about it (and don't know how much RAM is available for it - it's taken from the input buffer)
1095 ),
1096 "keynote" : makeDic(
1097 'Phoneme-read and lexicon-add codes for Keynote Gold hardware synthesizers (American English)', # ISA, PCMCIA, serial, etc; non-serial models give you an INT 2Fh param to get the address of an API function to call; not sure which software can send these codes directly to it)
1098 (syllable_separator,'',False),
1099 (primary_stress,"'"),(secondary_stress,'"'),
1100 ('w',w),('y',y),('h',h),('m',m),('n',n),('ng',ng),
1101 ('l',l),('r',r),('f',f),('v',v),('s',s),('z',z),
1102 ('th',th_as_in_think),('dh',th_as_in_them),('k',k),
1103 ('ch',ch),('zh',ge_of_blige_etc),('sh',sh),('g',g),
1104 ('jh',j_as_in_jump),('b',b),('p',p),('d',d),('t',t),
1105 ('i',e_as_in_eat),('I',i_as_in_it),
1106 ('e',a_as_in_ate),('E',e_as_in_them),
1107 ('ae',a_as_in_apple),('u',oo_as_in_food),
1108 ('U',opt_u_as_in_pull),('o',o_as_in_go),
1109 ('O',close_to_or),('a',o_as_in_orange),
1110 ('^',u_as_in_but),('R',e_as_in_herd),
1111 ('ay',eye),('Oy',oy_as_in_toy),('aw',o_as_in_now),
1112 ('=',a_as_in_ago),
1113 approximate_missing=True,
1114 inline_format="[p]%s[t]",
1115 lex_filename="keynote.dat", # you have to somehow get this directly dumped to the card, see comment above
1116 lex_entry_format="[x]%s %s", lex_footer="[t]\n",
1117 stress_comes_before_vowel=False, # even though it's "'"
1118 ),
1119 "audapter" : makeVariantDic(
1120 "Audapter Speech System, an old hardware serial/parallel-port synthesizer (American English)", # 1989 I think. The phonemes themselves are the same as the Keynote above, but there's an extra binary byte in the commands and the lex format is stricter. I haven't checked but my guess is Audapter came before Keynote.
1121 inline_format='\x05[p] %s\x05[t]',
1122 format_is_binary=True,
1123 lex_filename="audapter.dat",
1124 lex_entry_format="\x05[x]%s %s\x05[t]\n", lex_footer="",
1125 ),
1126 "bbcmicro" : makeDic(
1127 "BBC Micro Speech program from 1985 (see comments in lexconvert.py for more details)",
1128 # Speech was written by David J. Hoskins and published by Superior Software. It took 7.5k of RAM including 3.1k of samples (49 phonemes + 1 for fricatives at 64 bytes each, 4-bit ~5.5kHz), 2.2k of lexicon, and 2.2k of machine code; sounds "retro" by modern standards but quite impressive for the BBC Micro in 1985. Samples are played by amplitude-modulating the BBC's tone generator.
1129 # If you use an emulator like BeebEm, you'll need diskimg/Speech.ssd. This can be made from your original Speech disc, or you might be able to find one but beware of copyright! Same goes with the ROM images included in BeebEm (you might want to delete ones you didn't have). There has been considerable discussion over whether UK copyright law does or should allow "format-shifting" your own legally-purchased media, and I don't fully understand all the discussion so I don't want to give advice on it here. The issue is "format-shifting" your legally-purchased BBC Micro ROM code and Speech disc to emulator images; IF this is all right then I suspect downloading someone else's copy is arguably allowed as long as you bought it legally "back in the day", but I'm not a solicitor so I don't know.
1130 # (Incidentally, yes I was the Silas Brown referred to in Beebug 11.1 p.59, 11.9 p.50/11.10 p.47 and 12.10 p.24, and, no, the question in the final issue wasn't quite how I put it, but all taken in good humour.)
1131 # lexconvert's --phones bbcmicro option creates *SPEAK commands which you can type into the BBC Micro or paste into an emulator, either at the BASIC prompt, or in a listing with line numbers provided by AUTO. You have to load the Speech program first of course.
1132 # To script this on BeebEm, first turn off the Speech disc's boot option (by turning off File / Disc options / Write protect and entering "*OPT 4,0"; use "*OPT 4,3" if you want it back later; if you prefer to edit the disk image outside of the emulator then change byte 0x106 from 0x33 to 0x03), and then you can do (e.g. on a Mac) open /usr/local/BeebEm3/diskimg/Speech.ssd && sleep 1 && (echo '*SPEECH';python lexconvert.py --phones bbcmicro "Greetings from 19 85") | pbcopy && osascript -e 'tell application "System Events" to keystroke "v" using command down'
1133 # or if you know it's already loaded: echo "Here is some text" | python lexconvert.py --phones bbcmicro | pbcopy && osascript -e 'tell application "BeebEm3" to activate' && osascript -e 'tell application "System Events" to keystroke "v" using command down'
1134 # (unfortunately there doesn't seem to be a way of doing it without giving the emulator window focus)
1135 # If you want to emulate a Master, you might need a *DISK before the *SPEECH (to take it out of ADFS mode).
1136 # You can also put Speech into ROM, but this can cause problems: see comments on SP8000 later.
1137 (syllable_separator,'',False),
1138 ('4',primary_stress),
1139 ('5',secondary_stress), # (these are pitch numbers on the BBC; normal pitch is 6, and lower numbers are higher pitches, so try 5=secondary and 4=primary; 3 sounds less calm)
1140 ('AA',a_as_in_ah),
1141 ('AE',a_as_in_apple),
1142 ('AH',u_as_in_but),
1143 ('O',o_as_in_orange),
1144 ('AW',o_as_in_now),
1145 (a_as_in_ago,'AH',False),
1146 ('ER',e_as_in_herd),
1147 ('IY',eye),
1148 ('B',b),
1149 ('CH',ch),
1150 ('D',d),
1151 ('DH',th_as_in_them),
1152 ('EH',e_as_in_them),
1153 (ar_as_in_year,'ER',False),
1154 ('AI',a_as_in_air),
1155 ('AY',a_as_in_ate),
1156 ('F',f),
1157 ('G',g),
1158 ('/H',h),
1159 ('IH',i_as_in_it),
1160 ('IX',var2_i_as_in_it), # (IX sounds to me like a slightly shorter version of IH)
1161 ('IXAH',ear),
1162 ('EER',var2_ear), # e.g. 'hear', 'near' - near enough
1163 ('EE',e_as_in_eat),
1164 ('J',j_as_in_jump),
1165 ('K',k),
1166 ('C',k,False), # for CT as in "fact", read out as K+T
1167 ('L',l),
1168 ('M',m),
1169 ('N',n),
1170 ('NX',ng),
1171 ('OW',o_as_in_go),
1172 ('OL',opt_ol_as_in_gold), # (if dest format doesn't have this, it'll get o_as_in_orange from the O, then the l)
1173 ('OY',oy_as_in_toy),
1174 ('P',p),
1175 ('R',r),
1176 ('S',s),
1177 ('SH',sh),
1178 ('T',t),
1179 ('TH',th_as_in_think),
1180 ('AOR',oor_as_in_poor),
1181 ('UH',oor_as_in_poor,False), # TODO: really? (espeak 'U' goes to opt_u_as_in_pull, and eSpeak also used U for the o in good, which sounds best with Speech's default UH4, hence the line below, but where did we get UH->oor_as_in_poor from? Low-priority though because how often do you convert OUT of bbcmicro format)
1182 (opt_u_as_in_pull,'UH',False),
1183 ('/U',opt_u_as_in_pull,False),
1184 ('/UL',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the /U, then l
1185 ('UW',oo_as_in_food),
1186 ('UX',oo_as_in_food,False),
1187 ('AO',close_to_or),
1188 ('V',v),
1189 ('W',w),
1190 ('Y',y),
1191 ('Z',z),
1192 ('ZH',ge_of_blige_etc),
1193 lex_filename=ifset("MAKE_SPEECH_ROM","SPEECH.ROM","BBCLEX"),
1194 lex_entry_format=as_utf8("> %s_")+chr(128)+as_utf8("%s"), # (specifying 'whole word' for now; remove the space before or the _ after if you want)
1195 lex_read_function = lambda lexfile: [(w[0].lstrip().rstrip('_').lower(),w[1]) for w in filter(lambda x:len(x)==2,[w.split(chr(128)) for w in getBuf(lexfile).read().split('>')])], # TODO: this reads back the entries we generate, but is unlikely to work well with the wildcards in the default lexicon that would have been added if SPEECH_DISK was set (c.f. trying to read eSpeak's en_rules instead of en_extra)
1196 lex_word_case = "upper",
1197 lex_header = bbc_prepDefaultLex,
1198 lex_footer = bbc_appendDefaultLex, # + ">**"
1199 inline_format = markup_bbcMicro_word,
1200 word_separator=" ",phoneme_separator="",
1201 clause_separator=write_bbcmicro_phones, # special case
1202 safe_to_drop_characters=True, # TODO: really?
1203 cleanup_regexps=[
1204 ('KT','CT'), # Speech instructions: "CT as in fact"
1205 ('DYUW','DUX'), # "DUX as in duke"
1206 ('AHR$','AH'), # usually sounds a bit better
1207 ],
1208 cvtOut_regexps=[('DUX','DYUW')], # CT handled above
1209 ),
1210 "bbcmicro-cc" : makeDic(
1211 "Computer Concepts Speech ROM which provided phonemes for the BBC Micro's TMS5220 \"speech chip\" add-on (less widely sold than the software-only product)", # (and harder to run on an emulator. It wasn't the only phoneme ROM, e.g. Easytalk Speech Utility ROM by Galaxy, reviewed in Beebug Jan/Feb 1985 (3.8) p.32, expanded on Acorn's original PHROM with commands like *SAY Y.U:N.I.V.ER.S but we don't know all the phonemes; there were also some allophone-based hardware boards)
1212 (syllable_separator,"",False),
1213 ('*',primary_stress),('+',secondary_stress),
1214 ('E',e_as_in_eat),('i',i_as_in_it),('e',e_as_in_them),
1215 ('a',a_as_in_apple),('u',u_as_in_but),('AR',a_as_in_ah),
1216 ('o',o_as_in_orange),('OR',close_to_or),('oo',opt_u_as_in_pull),
1217 ('OO',oo_as_in_food),('ER',e_as_in_herd),('A',a_as_in_ate),
1218 ('I',eye),('O',o_as_in_go),('OY',oy_as_in_toy),
1219 ('AW',o_as_in_now),('EA',ear),('ea',a_as_in_air),
1220 ('UR',oor_as_in_poor),('UH',a_as_in_ago),
1221 ('P',p),('B',b),('T',t),
1222 ('D',d),('K',k),('G',g),
1223 ('CH',ch),('J',j_as_in_jump),('F',f),
1224 ('V',v),('TH',th_as_in_think),('DH',th_as_in_them),
1225 ('S',s),('Z',z),('SH',sh),
1226 ('ZH',ge_of_blige_etc),('H',h),('M',m),
1227 ('N',n),('NG',ng),('L',l),
1228 ('R',r),('Y',y),('W',w),
1229 stress_comes_before_vowel=True,
1230 inline_header="*UTTER <1> ",
1231 clause_separator="\n*UTTER <1> ", # TODO: manual does not say what the maximum length is; longest parameter in examples is 80 bytes; should we use inline_format to make each WORD a separate command?
1232 cleanup_regexps=[('[*] ','*'),('[+] ','+')],
1233 safe_to_drop_characters=' ',
1234 ),
1235
1236 "amiga" : makeDic(
1237 'AmigaOS speech synthesizer (American English)', # shipped with the 1985 Amiga release; developed by SoftVoice Inc
1238 # All I had to go by for this was a screenshot on Marcos Miranda's "blog". I once saw this synth demonstrated but never tried it. My early background was the BBC Micro, not Amigas etc. But I know some people are keen on Amigas so I might as well include it.
1239 # (By the way I think David Hoskins had it harder than SoftVoice. Yes they were both in 1985, but the Amiga was a new 16-bit machine while the BBC was an older 8-bit one. See the "sam" format for an even older one though, although probably not written by one person.)
1240 (syllable_separator,'',False),
1241 ('4',primary_stress),('3',secondary_stress),
1242 ('/H',h),
1243 ('EH',e_as_in_them),
1244 ('L',l),
1245 ('OW',o_as_in_go),
1246 ('AY',eye),
1247 ('AE',a_as_in_apple),
1248 ('M',m),
1249 ('DH',th_as_in_them),
1250 ('IY',e_as_in_eat),
1251 ('AH',a_as_in_ago),
1252 ('G',g),
1253 ('K',k),
1254 ('U',u_as_in_but),
1255 ('P',p),
1256 ('Y',y),
1257 ('UW',oo_as_in_food),
1258 ('T',t),
1259 ('ER',var1_a_as_in_ago),
1260 ('IH',i_as_in_it),
1261 ('S',s),
1262 ('Z',z),
1263 ('AW',o_as_in_now),
1264 ('AA',a_as_in_ah),
1265 ('R',r),
1266 ('D',d),('F',f),('N',n),('NX',ng),('J',j_as_in_jump),
1267 ('B',b),('V',v),('TH',th_as_in_think),
1268 ('OH',close_to_or),('EY',a_as_in_ate),
1269 # The following consonants were not on the screenshot
1270 # (or at least I couldn't find them) so I'm guessing.
1271 # I think this should work given the way the other
1272 # consonants work in this table.
1273 ('W',w),('CH',ch),('SH',sh),
1274 # The following vowels were not in the screenshot and
1275 # we just have to hope this guess is right - when
1276 # someone tries it on an Amiga and says it doesn't
1277 # work, maybe we can update this....
1278 ('O',o_as_in_orange),('OY',oy_as_in_toy),
1279 # and these ones we can approximate to ones we already know (given that we're having to approximate British to an American voice anyway, it can't hurt TOO much more)
1280 (a_as_in_air,'EH',False),
1281 (e_as_in_herd,'ER',False),
1282 (ar_as_in_year,'ER',False),
1283 (ear,'IYAH',False), # or try IYER, or there might be a phoneme for it
1284 (ge_of_blige_etc,'J',False),
1285 (oor_as_in_poor,'OH',False),
1286 # lex_filename not set (I have no idea how the Amiga lexicon worked)
1287 safe_to_drop_characters=True, # TODO: really?
1288 word_separator=" ",phoneme_separator="",
1289 ),
1290 "sam" : makeDic(
1291 'Software Automatic Mouth (1982 American English synth that ran on C64, Atari 400/800/etc and Apple II/etc)', # *might* be similar to Macintalk on the 1st Macintosh in 1984
1292 (syllable_separator,'',False),
1293 (primary_stress,'4'),
1294 (secondary_stress,'5'),
1295 ('IY',e_as_in_eat),
1296 ('IH',i_as_in_it),
1297 ('EH',e_as_in_them),
1298 ('AE',a_as_in_apple),
1299 ('AA',o_as_in_orange),
1300 ('AH',u_as_in_but),
1301 ('AO',close_to_or),
1302 ('OH',o_as_in_go),
1303 ('UH',opt_u_as_in_pull),
1304 ('UX',oo_as_in_food),
1305 ('ER',e_as_in_herd),
1306 ('AX',a_as_in_apple,False), # allophone?
1307 ('IX',i_as_in_it,False), # allophone?
1308 ('EY',a_as_in_ate),
1309 ('AY',eye),('OY',oy_as_in_toy),
1310 ('AW',o_as_in_now),('OW',o_as_in_go,False),
1311 ('UW',oo_as_in_food,False), # allophone?
1312 ('R',r),('L',l),('W',w),('WH',w,False),('Y',y),('M',m),
1313 ('N',n),('NX',ng),('B',b),('D',d),('G',g),('Z',z),
1314 ('J',j_as_in_jump),('ZH',ge_of_blige_etc),('V',v),
1315 ('DH',th_as_in_them),('S',s),('SH',sh),('F',f),
1316 ('TH',th_as_in_think),('P',p),('T',t),('K',k),
1317 ('CH',ch),('/H',h),('Q',glottal_stop),
1318 approximate_missing=True,
1319 word_separator=" ",phoneme_separator="",
1320 # TODO: inline_format etc similar to bbcmicro?
1321 # In Atari BASIC, you set SAM$ to the phonemes and then
1322 # do A=USR(8192). I don't know about the C64 etc versions.
1323 # (max 255 phonemes per string; don't know max line len.)
1324 ),
1325
1326 "cheetah" : makeDic(
1327 'Allophone codes for the 1983 "Cheetah Sweet Talker" SP0256-based hardware add-on for ZX Spectrum and BBC Micro home computers. The conversion from phonemes to allophones might need tweaking.',
1328 (syllable_separator,'',False),
1329 ("0",syllable_separator,False),
1330 ("1",syllable_separator,False),
1331 ("2",syllable_separator,False),
1332 ("3",syllable_separator,False),
1333 ("4",syllable_separator,False),
1334 ("5",oy_as_in_toy),
1335 ("6",eye),
1336 ("7",e_as_in_them),
1337 ("8",k,False),
1338 ("9",p),
1339 ("10",j_as_in_jump),
1340 ("11",n),
1341 ("12",i_as_in_it),
1342 ("13",t),
1343 ("14",r),
1344 ("15",u_as_in_but),
1345 ("16",m),
1346 ("17",t,False),
1347 ("18",th_as_in_them),
1348 ("19",e_as_in_eat),
1349 ("20",a_as_in_ate),
1350 ("21",d),
1351 ("22",oo_as_in_food),
1352 ("23",close_to_or),
1353 ("24",o_as_in_orange),
1354 ("25",y),
1355 ("26",a_as_in_apple),
1356 ("27",h),
1357 ("28",b),
1358 ("29",th_as_in_think),
1359 (opt_u_as_in_pull,"30",False),
1360 ("30",opt_ul_as_in_pull),
1361 ("31",oo_as_in_food,False),
1362 ("32",o_as_in_now),
1363 ("33",d,False),
1364 ("34",g,False),
1365 ("35",v),
1366 ("36",g),
1367 ("37",sh),
1368 ("38",ge_of_blige_etc),
1369 ("39",r,False),
1370 ("40",f),
1371 ("41",k),
1372 ("42",k,False),
1373 ("43",z),
1374 ("44",ng),
1375 ("45",l),
1376 ("46",w),
1377 ("47",a_as_in_air),
1378 ("49",y,False),
1379 ("50",ch),
1380 ("51",a_as_in_ago),
1381 ("52",e_as_in_herd),
1382 (var1_a_as_in_ago,"52",False),
1383 ("53",o_as_in_go),
1384 ("54",th_as_in_them,False),
1385 ("55",s),
1386 ("56",n,False),
1387 ("57",h,False),
1388 ("58",var3_close_to_or),
1389 ("59",a_as_in_ah),
1390 ("60",ear), # or var2_ear
1391 ("61",g,False),
1392 ("62",l,False),
1393 ("63",b,False),
1394 approximate_missing=True,
1395 phoneme_separator=',',safe_to_drop_characters=",",
1396 inline_header="DATA ",inline_footer=",0"),
1397
1398 # END (?) PRE-32bit ERA SYNTHS (but see TODO above re SpeakJet, which is below)
1399
1400 "speakjet" : makeDic(
1401 'Allophone codes for the American English "SpeakJet" speech synthesis chip (the conversion from phonemes to allophones might need tweaking). Set the SPEAKJET_SYM environment variable to use mnemonics, otherwise numbers are used (set SPEAKJET_BINARY for binary output).',
1402 # TODO: might want to do something similar for the older Votrax SC-02 chip, but would need to check how exactly its phoneme interface was exposed to software by the PC cards that used it (Heathkit HV-2000 etc; not sure if any are still in use though)
1403 (syllable_separator,'',False), # TODO: instead of having emphasis, the Speakjet has a 'faster' code for all NON-emphasized syllables
1404 (speakjet('IY',128),e_as_in_eat),
1405 (speakjet('IH',129),i_as_in_it),
1406 (speakjet('EY',130),a_as_in_ate),
1407 (speakjet('EH',131),e_as_in_them),
1408 (speakjet('AY',132),a_as_in_apple),
1409 (speakjet('AX',133),a_as_in_ago),
1410 (speakjet('UX',134),u_as_in_but),
1411 (speakjet('OH',135),o_as_in_orange),
1412 (speakjet('AW',136),a_as_in_ah),
1413 (speakjet('OW',137),o_as_in_go),
1414 (speakjet('UH',138),opt_u_as_in_pull),
1415 (speakjet('UW',139),oo_as_in_food),
1416 (speakjet('MM',140),m),
1417 (speakjet('NE',141),n,False),
1418 (speakjet('NO',142),n),
1419 (speakjet('NGE',143),ng,False),
1420 (speakjet('NGO',144),ng),
1421 (speakjet('LE',145),l,False),
1422 (speakjet('LO',146),l),
1423 (speakjet('WW',147),w),
1424 (speakjet('RR',148),r),
1425 (speakjet('IYRR',149),ear),
1426 (speakjet('EYRR',150),a_as_in_air),
1427 (speakjet('AXRR',151),e_as_in_herd),
1428 (speakjet('AWRR',152),a_as_in_ah,False),
1429 (speakjet('OWRR',153),close_to_or),
1430 (speakjet('EYIY',154),a_as_in_ate,False),
1431 (speakjet('OHIY',155),eye),
1432 (speakjet('OWIY',156),oy_as_in_toy),
1433 (speakjet('OHIH',157),eye,False),
1434 (speakjet('IYEH',158),y),
1435 (speakjet('EHLL',159),l,False),
1436 (speakjet('IYUW',160),oo_as_in_food,False),
1437 (speakjet('AXUW',161),o_as_in_now),
1438 (speakjet('IHUW',162),oo_as_in_food,False),
1439 # TODO: 163 AYWW = o_as_in_now a_as_in_ago ? handle in cleanup_regexps + cvtOut_regexps ?
1440 (speakjet('OWWW',164),o_as_in_go,False),
1441 (speakjet('JH',165),j_as_in_jump),
1442 (speakjet('VV',166),v),
1443 (speakjet('ZZ',167),z),
1444 (speakjet('ZH',168),ge_of_blige_etc),
1445 (speakjet('DH',169),th_as_in_them),
1446 # TODO: get cleanup_regexps to clean up some of these according to what's coming next etc:
1447 (speakjet('BE',170),b,False),
1448 (speakjet('BO',171),b),
1449 (speakjet('EB',172),b,False),
1450 (speakjet('OB',173),b,False),
1451 (speakjet('DE',174),d,False),
1452 (speakjet('DO',175),d),
1453 (speakjet('ED',176),d,False),
1454 (speakjet('OD',177),d,False),
1455 (speakjet('GE',178),g,False),
1456 (speakjet('GO',179),g),
1457 (speakjet('EG',180),g,False),
1458 (speakjet('OG',181),g,False),
1459 (speakjet('CH',182),ch),
1460 (speakjet('HE',183),h,False),
1461 (speakjet('HO',184),h),
1462 (speakjet('WH',185),w,False),
1463 (speakjet('FF',186),f),
1464 (speakjet('SE',187),s,False),
1465 (speakjet('SO',188),s),
1466 (speakjet('SH',189),sh),
1467 (speakjet('TH',190),th_as_in_think),
1468 (speakjet('TT',191),t),
1469 (speakjet('TU',192),t,False),
1470 # TODO: 193 TS in cleanup_regexps and cvtOut_regexps
1471 (speakjet('KE',194),k,False),
1472 (speakjet('KO',195),k),
1473 (speakjet('EK',196),k,False),
1474 (speakjet('OK',197),k,False),
1475 (speakjet('PE',198),p,False),
1476 (speakjet('PO',199),p),
1477 # lex_filename not set (I think the front-end software might have one, but don't know if it's accessible; chip itself just takes phonemes)
1478 approximate_missing=True,
1479 word_separator=ifset('SPEAKJET_BINARY',""," "),
1480 phoneme_separator=ifset('SPEAKJET_BINARY',""," "),
1481 clause_separator=ifset('SPEAKJET_BINARY',"","\n"), # TODO: is there a pause code?
1482 output_is_binary=ifset('SPEAKJET_BINARY',True),
1483 safe_to_drop_characters=True, # TODO: really?
1484 ),
1485
1486 "rsynth" : makeDic(
1487 'rsynth text-to-speech C library (American English)', # TODO: test
1488 (syllable_separator,'',False), # TODO: emphasis?
1489 ("i:",e_as_in_eat),
1490 ("I",i_as_in_it),
1491 ("eI",a_as_in_ate),
1492 ("E",e_as_in_them),
1493 ("{",a_as_in_apple),
1494 ("V",u_as_in_but),
1495 ("Q",o_as_in_orange),
1496 ("A:",a_as_in_ah),
1497 ("oU",o_as_in_go),
1498 ("U",opt_u_as_in_pull),
1499 ("u:",oo_as_in_food),
1500 ("m",m),
1501 ("n",n),
1502 ("N",ng),
1503 ("l",l),
1504 ("w",w),
1505 ("r",r),
1506 ("I@",ear),
1507 ("e@",a_as_in_air),
1508 ("3:",e_as_in_herd),
1509 ("Qr",close_to_or),
1510 ("OI",oy_as_in_toy),
1511 ("aI",eye),
1512 ("j",y),
1513 ("U@",oo_as_in_food,False),
1514 ("aU",o_as_in_now),
1515 ("@U",o_as_in_go,False),
1516 ("dZ",j_as_in_jump),
1517 ("v",v),
1518 ("z",z),
1519 ("Z",ge_of_blige_etc),
1520 ("D",th_as_in_them),
1521 ("b",b),
1522 ("d",d),
1523 ("g",g),
1524 ("tS",ch),
1525 ("h",h),
1526 ("f",f),
1527 ("s",s),
1528 ("S",sh),
1529 ("T",th_as_in_think),
1530 ("t",t),
1531 ("k",k),
1532 ("p",p),
1533 approximate_missing=True,
1534 # lex_filename not set (TODO: check what sort of lexicon is used by rsynth's "say" front-end)
1535 safe_to_drop_characters=True, # TODO: really?
1536 word_separator=" ",phoneme_separator="",
1537 ),
1538
1539 "unicode-ipa" : makeDic(
1540 "IPA symbols in Unicode, as used by an increasing number of dictionary programs, websites etc",
1541 ('.',syllable_separator,False),
1542 (syllable_separator,'',False),
1543 (u'\u02c8',primary_stress),
1544 (u'\u02cc',secondary_stress),
1545 # NB the above two are "modifier", not "combining",
1546 # Unicode characters. There IS a difference. If
1547 # your software displays them as overprinting the
1548 # surrounding letters, you have a bug.
1549 # (E.g. WeChat v1.2.2.1 on Mac OS 10.7)
1550 ('#',text_sharp),
1551 ('_',text_underline),
1552 ('?',text_question),
1553 ('!',text_exclamation),
1554 (',',text_comma),
1555 (u'\u0251',a_as_in_ah),
1556 (u'\u02d0',ipa_colon),
1557 (u'\u0251\u02d0',var3_a_as_in_ah),
1558 (u'\u0251\u0279',var4_a_as_in_ah),
1559 (u'a\u02d0',var5_a_as_in_ah),
1560 (u'\xe6',a_as_in_apple),
1561 ('a',a_as_in_apple,False),
1562 (u'\u028c',u_as_in_but),
1563 ('\u1d27',u_as_in_but,False), # 28c sometimes mistakenly written as 1d27
1564 (u'\u0252',o_as_in_orange),
1565 (var1_o_as_in_orange,u'\u0251',False),
1566 (u'\u0254',var2_o_as_in_orange),
1567 (u'a\u028a',o_as_in_now),
1568 (u'\xe6\u0254',var1_o_as_in_now),
1569 (u'\u0259',a_as_in_ago),
1570 (u'\u0259\u02d0',e_as_in_herd),
1571 (u'\u025a',var1_a_as_in_ago),
1572 (u'a\u026a',eye), (u'\u028c\u026a',eye,False),
1573 (u'\u0251e',var1_eye),
1574 ('b',b),
1575 (u't\u0283',ch),
1576 (u'\u02a7',ch,False),
1577 ('d',d),
1578 (u'\xf0',th_as_in_them),
1579 (u'\u025b',e_as_in_them),
1580 ('e',var1_e_as_in_them),
1581 (u'\u025d',ar_as_in_year),
1582 (u'\u025c\u02d0',ar_as_in_year,False),
1583 (u'\u025b\u0259',a_as_in_air),
1584 (u'\u025b\u0279',var1_a_as_in_air),
1585 (u'e\u02d0',var2_a_as_in_air),
1586 (u'\u025b\u02d0',var3_a_as_in_air),
1587 (u'e\u0259',var4_a_as_in_air),
1588 (u'e\u026a',a_as_in_ate),
1589 (u'\xe6\u026a',var1_a_as_in_ate),
1590 ('f',f),
1591 (u'\u0261',g), ('g',g,False),
1592 ('h',h),
1593 (u'\u026a',i_as_in_it),
1594 (u'\u0268',var1_i_as_in_it),
1595 (u'\u026a\u0259',ear),
1596 (u'\u026a\u0279',var1_ear),
1597 (u'\u026a\u0279\u0259',var2_ear), # ?
1598 ('i',e_as_in_eat),
1599 (u'i\u02d0',var1_e_as_in_eat),
1600 (u'd\u0292',j_as_in_jump),
1601 (u'\u02a4',j_as_in_jump,False),
1602 ('k',k),
1603 ('x',opt_scottish_loch),
1604 ('l',l),
1605 (u'd\u026b',var1_l),
1606 ('m',m),
1607 ('n',n),
1608 (u'\u014b',ng),
1609 (u'\u0259\u028a',o_as_in_go),
1610 ('o',var1_o_as_in_go),
1611 (u'o\u028a',var2_o_as_in_go),
1612 (u'\u0259\u0289',var1_u_as_in_but),
1613 (u'\u0254\u026a',oy_as_in_toy),
1614 (u'o\u026a',var1_oy_as_in_toy),
1615 ('p',p),
1616 (u'\u0279',r), ('r',r,False),
1617 (var1_r,'r',False),
1618 ('s',s),
1619 (u'\u0283',sh),
1620 ('t',t),
1621 (u'\u027e',var1_t),
1622 (u'\u03b8',th_as_in_think),
1623 (u'\u028a\u0259',oor_as_in_poor),
1624 (u'\u028a\u0279',var1_oor_as_in_poor),
1625 (u'\u028a',opt_u_as_in_pull),
1626 (u'\u0289\u02d0',oo_as_in_food),
1627 (u'u\u02d0',var1_oo_as_in_food),
1628 ('u',var2_oo_as_in_food),
1629 (u'\u0254\u02d0',close_to_or),
1630 (var1_close_to_or,u'\u0254',False),
1631 (u'o\u02d0',var2_close_to_or),
1632 ('v',v),
1633 ('w',w),
1634 (u'\u028d',var1_w),
1635 ('j',y),
1636 ('z',z),
1637 (u'\u0292',ge_of_blige_etc),
1638 (u'\u0294',glottal_stop),
1639 lex_filename="words-ipa.html", # write-only for now
1640 lex_type = "HTML",
1641 lex_header = '<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body><table>',
1642 lex_entry_format="<tr><td>%s</td><td>%s</td></tr>\n",
1643 lex_footer = "</table></body></html>\n",
1644 word_separator=" ",phoneme_separator="",
1645 stress_comes_before_vowel=True,
1646 safe_to_drop_characters=True, # TODO: really? (at least '-' should be safe to drop)
1647 cvtOut_func=unicode_preprocess,
1648 ),
1649
1650 "unicode-ipa-syls" : makeVariantDic(
1651 "Like unicode-ipa but with syllable separators preserved",
1652 (syllable_separator,'.'),
1653 cleanup_regexps=[(r"\.+",".")], # multiple . to one .
1654 noInherit=True),
1655
1656 "yinghan" : makeVariantDic(
1657 "As unicode-ipa but, when converting a user lexicon, generates Python code that reads Wenlin Yinghan dictionary entries and adds IPA bands to matching words",
1658 lex_filename="yinghan-ipa.py", # write-only for now
1659 lex_type = "Python script",
1660 lex_header = r"""#!/usr/bin/env python
1661 # -*- coding: utf-8 -*-
1662
1663 # Works in both Python 2 and Python 3
1664
1665 import sys; d={""",
1666 lex_entry_format='u"%s":u"%s",\n',
1667 lex_footer = r"""}
1668 import re
1669 try: i,o=sys.stdin.buffer,sys.stdout.buffer # Python 3
1670 except AttributeError: i,o=sys.stdin,sys.stdout # Python 2
1671 for k in list(d.keys()): d[k.lower().encode('utf-8')]=d[k]
1672 nextIsHead=False
1673 for l in i:
1674 o.write(l)
1675 if nextIsHead and l.strip():
1676 w=l.split()
1677 if w[0]==u'ehw'.encode('utf-8'): l=u' '.encode('utf-8').join(w[1:])
1678 k = re.sub(u'\\([^)]*\\)$'.encode('utf-8'),u''.encode('utf-8'),l.strip()).strip().lower() # (allow parenthesised explanation after headword when matching)
1679 if k in d: o.write(u'ipa '.encode('utf-8')+d[k].encode('utf-8')+u'\n'.encode('utf-8'))
1680 if l.startswith(u'*** '.encode('utf-8')): nextIsHead=True
1681 """,
1682 noInherit=True
1683 ),
1684
1685 "unicode-rough" : makeVariantDic(
1686 "A non-standard notation that's reminiscent of unicode-ipa but changed so that more of the characters show in old browsers with incomplete fonts",
1687 ("'",primary_stress),
1688 (',',secondary_stress),
1689 ('ar-',a_as_in_ah),
1690 (':',ipa_colon),
1691 (var3_a_as_in_ah,'ar-',False),
1692 (var4_a_as_in_ah,'ar-',False),
1693 ('uh',u_as_in_but),
1694 (u'\u0259:',e_as_in_herd),
1695 ('ai',eye),
1696 ('ch',ch),
1697 ('e',e_as_in_them),
1698 ('3:',ar_as_in_year),
1699 (a_as_in_air,'e:',False),
1700 (var1_a_as_in_air,'e:',False),
1701 (var2_a_as_in_air,'e:',False),
1702 (var3_a_as_in_air,'e:',False),
1703 (var4_a_as_in_air,'e:',False),
1704 (u'ei',a_as_in_ate),
1705 (u'\xe6i',var1_a_as_in_ate),
1706 ('g',g),
1707 ('i',i_as_in_it), (var1_i_as_in_it,'i',False),
1708 ('eeuh-',ear), (var2_ear,'eeuh-',False),
1709 ('ee',e_as_in_eat), (var1_e_as_in_eat,'ee',False),
1710 ('j',j_as_in_jump),
1711 ('ng',ng),
1712 ('o',o_as_in_go),
1713 (var2_o_as_in_go,'o',False), # override unicode-ipa
1714 (var1_u_as_in_but,'o',False), # ditto (?? '+'?)
1715 ('oy',oy_as_in_toy), (var1_oy_as_in_toy,'oy',False),
1716 ('r',r),
1717 ('sh',sh),
1718 (var1_t,'t',False),
1719 ('th',th_as_in_think),
1720 ('or',oor_as_in_poor),
1721 (var1_oor_as_in_poor,'or',False),
1722 ('u',opt_u_as_in_pull), ('oo',oo_as_in_food),
1723 (var1_oo_as_in_food,'oo',False),
1724 (var2_oo_as_in_food,'oo',False),
1725 (close_to_or,'or',False),
1726 (var1_close_to_or,'or',False),
1727 (var2_close_to_or,'or',False),
1728 (var1_w,'w',False),
1729 ('y',y),
1730 ('3',ge_of_blige_etc),
1731 cleanup_regexps=[('-$','')],
1732 cvtOut_func=None,
1733 ),
1734
1735 "braille-ipa" : makeDic(
1736 "IPA symbols in Braille (2008 BANA standard). By default Braille ASCII is output; if you prefer to see the Braille dots via Unicode, set the BRAILLE_UNICODE environment variable.", # BANA = Braille Authority of North America. TODO: check if the UK accepted this standard.
1737 # TODO: add Unicode IPA signs that aren't used in English IPA, so we can do a general IPA conversion
1738 ('_B',primary_stress),
1739 ('_2',secondary_stress),
1740 ('*',a_as_in_ah),
1741 ('3',ipa_colon),
1742 ('*3',var3_a_as_in_ah),
1743 ('*#',var4_a_as_in_ah),
1744 ('A3',var5_a_as_in_ah),
1745 ('%',a_as_in_apple),
1746 ('A',a_as_in_apple,False),
1747 ('+',u_as_in_but),
1748 ('4*',o_as_in_orange),
1749 (var1_o_as_in_orange,'*',False),
1750 ('<',var2_o_as_in_orange),
1751 ('A(',o_as_in_now),
1752 ('%<',var1_o_as_in_now),
1753 ('5',a_as_in_ago),
1754 ('53',e_as_in_herd),
1755 ('5"R.',var1_a_as_in_ago),
1756 ('A/',eye),
1757 ('*E',var1_eye),
1758 ('B',b),
1759 ('T:',ch),
1760 ('T":.',ch,False),
1761 ('D',d),
1762 (']',th_as_in_them),
1763 ('>',e_as_in_them),
1764 ('E',var1_e_as_in_them),
1765 ('4>3',ar_as_in_year), # (from \u025c\u02d0; TODO: check what happens to \u025d)
1766 ('>5',a_as_in_air),
1767 ('>#',var1_a_as_in_air),
1768 ('E3',var2_a_as_in_air),
1769 ('>3',var3_a_as_in_air),
1770 ('E5',var4_a_as_in_air),
1771 ('E/',a_as_in_ate),
1772 ('%/',var1_a_as_in_ate),
1773 ('F',f),
1774 ('G',g),
1775 ('H',h),
1776 ('/',i_as_in_it),
1777 ('0I',var1_i_as_in_it),
1778 ('/5',ear),
1779 ('/#',var1_ear),
1780 ('/#5',var2_ear), # ?
1781 ('I',e_as_in_eat),
1782 ('I3',var1_e_as_in_eat),
1783 ('D!',j_as_in_jump),
1784 ('K',k),
1785 ('X',opt_scottish_loch),
1786 ('L',l),
1787 ('D6L',var1_l),
1788 ('M',m),
1789 ('N',n),
1790 ('$',ng),
1791 ('5(',o_as_in_go),
1792 ('O',var1_o_as_in_go),
1793 ('O(',var2_o_as_in_go),
1794 ('50U',var1_u_as_in_but),
1795 ('</',oy_as_in_toy),
1796 ('O/',var1_oy_as_in_toy),
1797 ('P',p),
1798 ('#',r),
1799 (var1_r,'R',False),
1800 ('S',s),
1801 (':',sh),
1802 ('T',t),
1803 ('6R',var1_t),
1804 ('.?',th_as_in_think),
1805 ('(5',oor_as_in_poor),
1806 ('(#',var1_oor_as_in_poor),
1807 ('(',opt_u_as_in_pull),
1808 ('0U3',oo_as_in_food),
1809 ('U3',var1_oo_as_in_food),
1810 ('U',var2_oo_as_in_food),
1811 ('<3',close_to_or),
1812 (var1_close_to_or,'<',False),
1813 ('O3',var2_close_to_or),
1814 ('V',v),
1815 ('W',w),
1816 ('6W',var1_w),
1817 ('J',y),
1818 ('Z',z),
1819 ('!',ge_of_blige_etc),
1820 ('2',glottal_stop),
1821 lex_filename=ifset("BRAILLE_UNICODE","words-ipa.txt","words-ipa.brl"), # write-only for now
1822 lex_type = "document",
1823 # inline_format=",7%s7'", # -> do this in cleanup_func so it's included in BRAILLE_UNICODE if necessary
1824 lex_entry_format="%s = %s\n", # ditto with the markers
1825 word_separator=" ",phoneme_separator="",
1826 stress_comes_before_vowel=True,
1827 safe_to_drop_characters=True, # TODO: really?
1828 cleanup_func=lambda r:ifset("BRAILLE_UNICODE",ascii_braille_to_unicode,lambda x:x)(",7"+r+"7'"),
1829 cvtOut_func=unicode_to_ascii_braille,
1830 ),
1831
1832 "latex-ipa" : makeDic(
1833 'IPA symbols for typesetting in LaTeX using the "tipa" package',
1834 ('.',syllable_separator,False),
1835 ('"',primary_stress),
1836 ('\\textsecstress{}',secondary_stress),
1837 ('\\#',text_sharp),
1838 ('\\_',text_underline),
1839 ('?',text_question),
1840 ('!',text_exclamation),
1841 (',',text_comma),
1842 ('A',a_as_in_ah),
1843 (':',ipa_colon),
1844 ('A:',var3_a_as_in_ah),
1845 ('A\\textturnr{}',var4_a_as_in_ah),
1846 ('a:',var5_a_as_in_ah),
1847 ('\\ae{}',a_as_in_apple),
1848 ('2',u_as_in_but),
1849 ('6',o_as_in_orange),
1850 (var1_o_as_in_orange,'A',False),
1851 ('O',var2_o_as_in_orange),
1852 ('aU',o_as_in_now),
1853 ('\\ae{}O',var1_o_as_in_now),
1854 ('@',a_as_in_ago),
1855 ('@:',e_as_in_herd),
1856 ('\\textrhookschwa{}',var1_a_as_in_ago),
1857 ('aI',eye),
1858 ('Ae',var1_eye),
1859 ('b',b),
1860 ('tS',ch),
1861 ('d',d),
1862 ('D',th_as_in_them),
1863 ('E',e_as_in_them),
1864 ('e',var1_e_as_in_them),
1865 ('3:',ar_as_in_year),
1866 ('E@',a_as_in_air),
1867 ('E\\textturnr{}',var1_a_as_in_air),
1868 ('e:',var2_a_as_in_air),
1869 ('E:',var3_a_as_in_air),
1870 ('e@',var4_a_as_in_air),
1871 ('eI',a_as_in_ate),
1872 ('\\ae{}I',var1_a_as_in_ate),
1873 ('f',f),
1874 ('g',g),
1875 ('h',h),
1876 ('I',i_as_in_it),
1877 ('1',var1_i_as_in_it),
1878 ('I@',ear),
1879 ('I\\textturnr{}',var1_ear),
1880 ('I@\\textturnr{}',var2_ear), # ?
1881 ('i',e_as_in_eat),
1882 ('i:',var1_e_as_in_eat),
1883 ('dZ',j_as_in_jump),
1884 ('k',k),
1885 ('x',opt_scottish_loch),
1886 ('l',l),
1887 ('d\\textltilde{}',var1_l),
1888 ('m',m),
1889 ('n',n),
1890 ('N',ng),
1891 ('@U',o_as_in_go),
1892 ('o',var1_o_as_in_go),
1893 ('oU',var2_o_as_in_go),
1894 ('@0',var1_u_as_in_but),
1895 ('OI',oy_as_in_toy),
1896 ('oI',var1_oy_as_in_toy),
1897 ('p',p),
1898 ('\\textturnr{}',r),
1899 (var1_r,'r',False),
1900 ('s',s),
1901 ('S',sh),
1902 ('t',t),
1903 ('R',var1_t),
1904 ('T',th_as_in_think),
1905 ('U@',oor_as_in_poor),
1906 ('U\\textturnr{}',var1_oor_as_in_poor),
1907 ('U',opt_u_as_in_pull),
1908 ('0:',oo_as_in_food),
1909 ('u:',var1_oo_as_in_food),
1910 ('u',var2_oo_as_in_food),
1911 ('O:',close_to_or),
1912 (var1_close_to_or,'O',False),
1913 ('o:',var2_close_to_or),
1914 ('v',v),
1915 ('w',w),
1916 ('\\textturnw{}',var1_w),
1917 ('j',y),
1918 ('z',z),
1919 ('Z',ge_of_blige_etc),
1920 ('P',glottal_stop),
1921 lex_filename="words-ipa.tex", # write-only for now
1922 lex_type = "document",
1923 lex_header = r'\documentclass[12pt,a4paper]{article} \usepackage[safe]{tipa} \usepackage{longtable} \begin{document} \begin{longtable}{ll}',
1924 lex_entry_format=r"%s & \textipa{%s}\\"+"\n",
1925 lex_footer = r"\end{longtable}\end{document}"+"\n",
1926 inline_format = "\\textipa{%s}",
1927 inline_oneoff_header = r"% In preamble, put \usepackage[safe]{tipa}"+"\n", # (the [safe] part is recommended if you're mixing with other TeX)
1928 word_separator=" ",phoneme_separator="",
1929 clause_separator=r"\\"+"\n",
1930 stress_comes_before_vowel=True,
1931 safe_to_drop_characters=True, # TODO: really?
1932 ),
1933
1934 "pinyin-approx" : makeDic(
1935 "Rough approximation using roughly the spelling rules of Chinese Pinyin (for getting Chinese-only voices to speak some English words; works with some words better than others)", # write-only for now
1936 ('4',primary_stress),
1937 ('2',secondary_stress),
1938 ('a5',a_as_in_ah),
1939 ('ya5',a_as_in_apple),
1940 ('e5',u_as_in_but),
1941 ('yo5',o_as_in_orange),
1942 ('ao5',o_as_in_now),
1943 (e_as_in_herd,'e5',False),
1944 ('ai5',eye),
1945 ('bu0',b),
1946 ('che0',ch),
1947 ('de0',d),
1948 ('ze0',th_as_in_them),
1949 ('ye5',e_as_in_them),
1950 (a_as_in_air,'ye5',False),
1951 ('ei5',a_as_in_ate),
1952 ('fu0',f),
1953 ('ge0',g),
1954 ('he0',h),
1955 ('yi5',i_as_in_it),
1956 ('yi3re5',ear),
1957 (e_as_in_eat,'yi5',False),
1958 ('zhe0',j_as_in_jump),
1959 ('ke0',k),
1960 ('le0',l),
1961 ('me0',m),
1962 ('ne0',n),
1963 ('eng0',ng),
1964 ('ou5',o_as_in_go),
1965 ('ruo2yi5',oy_as_in_toy),
1966 ('pu0',p),
1967 ('re0',r),
1968 ('se0',s),
1969 ('she0',sh),
1970 ('te0',t),
1971 (th_as_in_think,'zhe0',False),
1972 (oor_as_in_poor,'wu5',False),
1973 ('yu5',oo_as_in_food),
1974 ('huo5',close_to_or),
1975 (v,'fu0',False),
1976 ('wu0',w),
1977 ('yu0',y),
1978 (z,'ze0',False),
1979 (ge_of_blige_etc,'zhe0',False),
1980 approximate_missing=True,
1981 lex_filename="words-pinyin-approx.txt", # write-only for now
1982 lex_type = "text",
1983 lex_header = "Pinyin approxmations (very approximate!)\n----------------------------------------\n",
1984 lex_entry_format = "%s ~= %s\n",
1985 word_separator=" ",phoneme_separator="",
1986 cleanup_regexps=[
1987 ("te0ye","tie"),
1988 ("e0e5","e5"),("([^aeiou][uo])0e(5)",r"\1\2"),
1989 ("yu0y","y"),
1990 ("wu0yo5","wo5"),
1991 ("([bdfghklmnpwz])[euo]0ei",r"\1ei"),
1992 ("([bdghklmnpstwz])[euo]0ai",r"\1ai"),
1993 ("([ghklmnpstyz])[euo]0ya",r"\1a"),("([ghklmnpstz])a([0-5]*)ne0",r"\1an\2"),
1994 ("([bdfghklmnpstwyz])[euo]0a([1-5])",r"\1a\2"),
1995 ("([bdjlmnpt])[euo]0yi",r"\1i"),("([bjlmnp])i([1-5]*)ne0",r"\1in\2"),
1996 ("([zs])he0ei",r"\1hei"),
1997 ("([dfghklmnprstyz])[euo]0ou",r"\1ou"),
1998 ("([dghklnrst])[euo]0huo",r"\1uo"),
1999 ("([bfpm])[euo]0huo",r"\1o"),
2000 ("([bdghklmnprstyz])[euo]0ao",r"\1ao"),
2001 ("([zcs])h[eu]0ao",r"\1hao"),
2002 ("re0r","r"),
2003 ("zhe0ne0","zhun5"),
2004 ("54","4"),
2005 ("52","2"),
2006 ("([bdjlmnpty])i([1-9])eng0",r"\1ing\2"),
2007 ("ya([1-9])eng0",r"yang\1"),
2008 ("ya([1-9])ne0",r"an\1"),
2009 ("ye([1-9])ne0",r"yan\1"),("([wr])[eu]0yan",r"\1en"),
2010 ("yi([1-9])ne0",r"yin\1"),
2011
2012 ("yu0","yu5"),("eng0","eng5"), # they won't work unvoiced anyway
2013 ("0","5"), # comment out if the synth supports 'tone 0 for unvoiced'
2014 #("[euo]0","0"), # comment in if it expects consonants only when doing that
2015 ],
2016 ),
2017
2018 "kana-approx" : makeDic(
2019 "Rough approximation using kana (for getting Japanese computer voices to speak some English words; works with some words better than others). Set KANA_TYPE environment variable to hiragana or katakana (which can affect the sounds of some voices); default is hiragana", # for example on Mac OS 10.7+ (with Japanese voice installed in System Preferences) try PHONES_PIPE_COMMAND='say -v Kyoko' (this voice has a built-in converter from English as well, but lexconvert --phones kana-approx can work better with some complex words, although the built-in converter does seem to have access to slightly more phonemes and can therefore produce words like "to" better). Default is hiragana because I find hiragana easier to read than katakana, although the Kyoko voice does seem to be able to say 'v' a little better when using kata. Mac OS 10.7+'s Korean voices (Yuna and Narae) can also read kana, and you could try doing a makeVariantDic and adding in some Korean jamo letters for them (you'd be pushed to represent everything in jamo but kana+jamo seems more hopeful in theory), but again some words work better than others (not all phonetic combinations are supported and some words aren't clear at all).
2020 # This kana-approx format is 'write-only' for now (see comment in cleanup_regexps re possible reversal)
2021 (u'\u30fc',primary_stress),
2022 (secondary_stress,ifset('KANA_MORE_EMPH',u'\u30fc'),False), # set KANA_MORE_EMPH environment variable if you want to try doubling the secondary-stressed vowels as well (doesn't always work very well; if it did, I'd put this line in a makeVariantDic called kana-approx-moreEmph or something)
2023 # The following Unicode codepoints are hiragana; KANA_TYPE is handled by cleanup_func below
2024 (u'\u3042',a_as_in_apple),
2025 (u'\u3044',e_as_in_eat),
2026 (u'\u3046',oo_as_in_food),
2027 (u'\u3048',e_as_in_them),
2028 (u'\u304a',o_as_in_orange),
2029 (u'\u3042\u3044',eye), # ai
2030 (u'\u3042\u304a',o_as_in_now), # ao
2031 (u'\u3048\u3044',a_as_in_ate), # ei
2032 (u'\u304a\u3044',oy_as_in_toy), # oi
2033 (u'\u304a\u3046',o_as_in_go), # ou
2034 (a_as_in_ah,u'\u3042',False),
2035 (a_as_in_ago,u'\u3046\u304a',False), # TODO: \u3042, \u304a or \u3046 depending on the word?
2036 (e_as_in_herd,u'\u3042',False), # TODO: really?
2037 (i_as_in_it,u'\u3044',False), # TODO: really?
2038 (u_as_in_but,u'\u3046',False), # TODO: really?
2039 (ar_as_in_year,u'\u3048',False), # TODO: really?
2040 (ear,u'\u3044\u304a',False), # TODO: really?
2041 (a_as_in_air,u'\u3048',False), # TODO: really?
2042 (oor_as_in_poor,u'\u304a',False), # TODO: really?
2043 (close_to_or,u'\u304a\u30fc'), # TODO: really?
2044 (u'\u3076',b), # bu (with vowel replacements later)
2045 (u'\u3061\u3047',ch), # chu (ditto)
2046 (u'\u3065',d), # du (and so on)
2047 (u'\u3066\u3085',th_as_in_think), (th_as_in_them,u'\u3066\u3085',False),
2048 (u'\u3075',f),
2049 (u'\u3050',g),
2050 (u'\u306f',h), # ha (as hu == fu)
2051 (u'\u3058\u3085',j_as_in_jump), (ge_of_blige_etc,u'\u3058\u3085',False),
2052 (u'\u304f',k),
2053 (u'\u308b',l), (r,u'\u308b',False),
2054 (u'\u3080',m),
2055 (u'\u306c',n),
2056 (u'\u3093\u3050',ng),
2057 (u'\u3077',p),
2058 (u'\u3059',s),
2059 (u'\u3057\u3085',sh),
2060 (u'\u3064',t),
2061 (u'\u308f',w), # use 'wa' (as 'wu' == 'u')
2062 (v,ifset('KANA_V_AS_W',u'\u308f',u'\u3094'),False), # TODO: document KANA_V_AS_W variable. Is vu always supported? (it doesn't seem to show up in all fonts)
2063 (u'\u3086',y),
2064 (u'\u305a',z),
2065 lex_filename="words-kana-approx.txt",
2066 lex_type = "text",
2067 lex_header = "Kana approxmations (very approximate!)\n--------------------------------------\n",
2068 lex_entry_format = "%s ~= %s\n",
2069 word_separator=" ",phoneme_separator="",
2070 clause_separator=u"\u3002\n".encode('utf-8'),
2071 cleanup_regexps=[(u"\u306c$",u"\u3093\u30fc"), # TODO: or u"\u3093\u3093" ?
2072 # now the vowel replacements (bu+a -> ba, etc) (in most cases these can be reversed into cvtOut_regexps if you want to use the kana-approx table to convert hiragana into approximate English phonemes (plus add a (u"\u3093\u30fc*",u"\u306c") and perhaps de-doubling rules to convert back to emphasis) but the result is unlikely to be any good)
2073 (u"\u3076\u3042",u"\u3070"),(u"\u3076\u3044",u"\u3073"),(u"\u3076\u3048",u"\u3079"),(u"\u3076\u304a",u"\u307c"),(u"\u3076\u3046",u"\u3076"),
2074 (u"\u3061\u3085\u3042",u"\u3061\u3083"),(u"\u3061\u3085\u3046",u"\u3061\u3085"),(u"\u3061\u3085\u3048",u"\u3061\u3047"),(u"\u3061\u3085\u304a",u"\u3061\u3087"),(u"\u3061\u3085\u3044",u"\u3061"),
2075 (u"\u3065\u3042",u"\u3060"),(u"\u3065\u3044",u"\u3062"),(u"\u3065\u3048",u"\u3067"),(u"\u3065\u304a",u"\u3069"),(u"\u3065\u3046",u"\u3065"),
2076 (u"\u3066\u3085\u3042",u"\u3066\u3083"),(u"\u3066\u3085\u3044",u"\u3066\u3043"),(u"\u3066\u3043\u3046",u"\u3066\u3085"),(u"\u3066\u3085\u3048",u"\u3066\u3047"),(u"\u3066\u3085\u304a",u"\u3066\u3087"),
2077 (u"\u3075\u3042",u"\u3075\u3041"),(u"\u3075\u3044",u"\u3075\u3043"),(u"\u3075\u3048",u"\u3075\u3047"),(u"\u3075\u304a",u"\u3075\u3049"),(u"\u3075\u3046",u"\u3075"),
2078 (u"\u306f\u3044",u"\u3072"),(u"\u306f\u3046",u"\u3075"),(u"\u306f\u3048",u"\u3078"),(u"\u306f\u304a",u"\u307b"),(u"\u306f\u3042",u"\u306f"),
2079 (u"\u3050\u3042",u"\u304c"),(u"\u3050\u3044",u"\u304e"),(u"\u3050\u3048",u"\u3052"),(u"\u3050\u304a",u"\u3054"),(u"\u3050\u3046",u"\u3050"),
2080 (u"\u3058\u3085\u3042",u"\u3058\u3083"),(u"\u3058\u3085\u3046",u"\u3058\u3085"),(u"\u3058\u3085\u3048",u"\u3058\u3047"),(u"\u3058\u3085\u304a",u"\u3058\u3087"),(u"\u3058\u3085\u304a",u"\u3058"),
2081 (u"\u304f\u3042",u"\u304b"),(u"\u304f\u3044",u"\u304d"),(u"\u304f\u3048",u"\u3051"),(u"\u304f\u304a",u"\u3053"),(u"\u304f\u3046",u"\u304f"),
2082 (u"\u308b\u3042",u"\u3089"),(u"\u308b\u3044",u"\u308a"),(u"\u308b\u3048",u"\u308c"),(u"\u308b\u304a",u"\u308d"),(u"\u308b\u3046",u"\u308b"),
2083 (u"\u3080\u3042",u"\u307e"),(u"\u3080\u3044",u"\u307f"),(u"\u3080\u3048",u"\u3081"),(u"\u3080\u304a",u"\u3082"),(u"\u3080\u3046",u"\u3080"),
2084 (u"\u306c\u3042",u"\u306a"),(u"\u306c\u3044",u"\u306b"),(u"\u306c\u3048",u"\u306d"),(u"\u306c\u304a",u"\u306e"),(u"\u306c\u3046",u"\u306c"),
2085 (u"\u3077\u3042",u"\u3071"),(u"\u3077\u3044",u"\u3074"),(u"\u3077\u3048",u"\u307a"),(u"\u3077\u304a",u"\u307d"),(u"\u3077\u3046",u"\u3077"),
2086 (u"\u3059\u3042",u"\u3055"),(u"\u3059\u3048",u"\u305b"),(u"\u3059\u304a",u"\u305d"),(u"\u3059\u3046",u"\u3059"),
2087 (u"\u3057\u3085\u3042",u"\u3057\u3083"),(u"\u3057\u3085\u3046",u"\u3057\u3085"),(u"\u3057\u3085\u3048",u"\u3057\u3047"),(u"\u3057\u3085\u304a",u"\u3057\u3087"),(u"\u3057\u3085\u3044",u"\u3057"),
2088 (u"\u3064\u3042",u"\u305f"),(u"\u3064\u3044",u"\u3061"),(u"\u3064\u3048",u"\u3066"),(u"\u3064\u304a",u"\u3068"),(u"\u3064\u3046",u"\u3064"),
2089 (u"\u3086\u3042",u"\u3084"),(u"\u3086\u3048",u"\u3044\u3047"),(u"\u3086\u304a",u"\u3088"),(u"\u3086\u3046",u"\u3086"),
2090 (u"\u305a\u3042",u"\u3056"),(u"\u305a\u3044",u"\u3058"),(u"\u305a\u3048",u"\u305c"),(u"\u305a\u304a",u"\u305e"),(u"\u305a\u3046",u"\u305a"),
2091 (u"\u308f\u3044",u"\u3046\u3043"),(u"\u308f\u3046",u"\u3046"),(u"\u308f\u3048",u"\u3046\u3047"),(u"\u308f\u304a",u"\u3092"),(u"\u308f\u3042",u"\u308f"),
2092 (u'\u3046\u3043\u3066\u3085', u'\u3046\u3043\u3065'), # sounds a bit better for words like 'with'
2093 (u'\u3085\u3046',u'\u3085'), # and 'the' (especially with a_as_in_ago mapping to u'\u3046\u304a'; it's hard to get a convincing 'the' though, especially in isolation)
2094 (u'\u3050\u3050',u'\u3050'), # gugu -> gu, sometimes comes up with 'gl-' combinations
2095 (u'\u30fc\u30fc+',u'\u30fc'), # in case we put 30fc in the table AND a stress mark has been applied to it
2096 (u'^(.)$',u'\\1\u30fc'), # lengthen any word that ends up as a single kana (otherwise can be clipped badly)
2097 (u'^([\u3042\u3070\u3060\u304c\u304b\u3089\u307e\u306a\u3071\u3055\u305f\u3084\u3056\u308f]\u3044)$',u'\\1\u30fc'), # ditto for -ai (TODO: -ao might need lengthening sometimes?? depends on context. -ei, -oi, -ou seem OK)
2098 ],
2099 cleanup_func = hiragana_to_katakana
2100 ),
2101
2102 "deva-approx" : makeDic(
2103 "Rough approximation using Devanagari (for getting Indian computer voices to speak some English words; works with some words better than others); can also be used to approximate Devanagari words in English phonemes",
2104 (u'\u02c8',primary_stress),
2105 (u'\u093e',a_as_in_ah),(u'\u0906',a_as_in_ah,False),
2106 (u'\u0905',u_as_in_but),
2107 (u'\u092c',b),
2108 (u'\u091b',ch),(u'\u091a',ch,False),
2109 (u'\u0926',d),(u'\u0921',d,False), # TODO: check which sounds better for reading English words
2110 (u'\u0920',th_as_in_them), # (very approximate)
2111 (u'\u0948',e_as_in_them),(u'\u0910',e_as_in_them,False),
2112 (u'\u0947',a_as_in_ate),(u'\u090f',a_as_in_ate,False),
2113 (u'\u092b\u093c',f),
2114 (u'\u0917',g),
2115 (u'\u0917\u093c',g,False), # (Hindi; differs in others)
2116 (u'\u0939',h),(u'\u0903',h,False),
2117 (u'\u093f',i_as_in_it),(u'\u0907',i_as_in_it,False),
2118 (u'\u0940',e_as_in_eat),(u'\u0908',e_as_in_eat,False),
2119 (u'\u091c',j_as_in_jump),
2120 (u'\u0915',k),(u'\u0916',k,False),
2121 (u'\u0916\u093c',opt_scottish_loch),
2122 (u'\u0915\u093c',opt_scottish_loch,False), # ?
2123 (u'\u0932',l),
2124 (u'\u092e',m),
2125 (u'\u0928',n),(u'\u0923',n,False),
2126 (u'\u0902',ng),
2127 (u'\u092a',p),
2128 (u'\u092b',f,False), # (Hindi; p in some others?)
2129 (u'\u0930',r),(u'\u0921\u093c',r,False),
2130 (u'\u0938',s),
2131 (u'\u0936',sh), (u'\u0937',sh,False),
2132 (u'\u091f',t),(u'\u0924',t,False),(u'\u0925',t,False),
2133 (u'\u0941',opt_u_as_in_pull),(u'\u0909',opt_u_as_in_pull,False),
2134 (u'\u0942',oo_as_in_food),(u'\u090a',oo_as_in_food,False),
2135 (u'\u094c',close_to_or),(u'\u0914',close_to_or,False),
2136 (u'\u094b',opt_ol_as_in_gold),(u'\u0913',opt_ol_as_in_gold,False),
2137 (u'\u0935',v),(w,u'\u0935',False),
2138 (u'\u092f',y),
2139 (u'\u091c\u093c',z),
2140 (u'\u091d\u093c',ge_of_blige_etc),
2141 (u'\u0901',ipa_colon),
2142 word_separator=" ",phoneme_separator="",
2143 stress_comes_before_vowel=True,
2144 safe_to_drop_characters=True, # it's an approximation
2145 approximate_missing=True,
2146 cleanup_regexps=[
2147 # add virama if consonant not followed by vowel, and delete default vowel after consonant:
2148 (u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u094d'),(u'(?<=[\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0905',u''),(u'(.)\u094d\u02c8',u'\u02c8\\1'),
2149 # replace vowel signs with vowel letters if not preceded by consonants:
2150 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093e',u'\u0906'),
2151 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093f',u'\u0907'),
2152 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0940',u'\u0908'),
2153 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0941',u'\u0909'),
2154 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0942',u'\u090a'),
2155 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0947',u'\u090f'),
2156 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0948',u'\u0910'),
2157 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094b',u'\u0913'),
2158 (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094c',u'\u0914')],
2159 cvtOut_func=unicode_preprocess,
2160 cvtOut_regexps=[
2161 # add default vowel when necessary:
2162 (u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u094d\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u0905'),(u'\u094d',u''),
2163 # 'add h' approximations:
2164 (u'\u092d',u'\u092c\u0939'),(u'\u0927',u'\u0922\u0939'),(u'\u0918',u'\u0917\u0939'),(u'\u091d',u'\u091c\u0939'),(u'\u0922\u093c',u'\u0921\u093c\u0939'),
2165 ]),
2166
2167 "names" : makeDic(
2168 "Lexconvert internal phoneme names (sometimes useful with the --phones option while developing new formats)",
2169 *[(phName,phVal) for phName,phVal in phonemes.items()])}
2170
2171 # The mainopt_...() functions are the main options
2172 # (if you implement a new one, main() will detect it);
2173 # 1st line of doc string should be parameter summary
2174 # (start the doc string with \n if no parameters); if 1st
2175 # character of doc string is * then this function is put
2176 # among the first in the help (otherwise alphabetically).
2177 # If function returns a string, that's taken to be a
2178 # message to be printed with error exit. Same if it raises
2179 # an exception of type Message.
2180
def mainopt_try(i):
    """*<format> [<pronunciation>]
Convert input from <format> into eSpeak and try it out.
(Requires the 'espeak' command.)
E.g.: python lexconvert.py --try festival h @0 l ou1
or: python lexconvert.py --try unicode-ipa '\\u02c8\\u0279\\u026adn\\u0329' (for Unicode put '\\uNNNN' or UTF-8)"""
    # NOTE: the doc string above is help text (see the mainopt_ conventions
    # comment before this function), so developer notes are kept in comments.
    #
    # i: position in sys.argv such that sys.argv[i+1] is the source format
    #    name; i+2 is handed to getInputText as the place to look for the
    #    pronunciation text.
    # Returns an error-message string if the format is unknown, else None.
    format = sys.argv[i+1]
    if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    for phones in getInputText(i+2,"phonemes in "+format+" format",'maybe'):
        espeak = convert(phones,format,'espeak')
        w = os.popen("espeak -x","w") # separate process each item for more responsiveness from the console (sending 'maybe' to getInputText means won't lose efficiency if not read from console)
        try: getBuf(w).write(markup_inline_word("espeak",espeak)+as_utf8('\n'))
        finally:
            # Close explicitly: otherwise the buffered write only reaches
            # espeak whenever the pipe object happens to be garbage-collected
            # (e.g. when w is rebound on the NEXT item), and the child process
            # is never deterministically waited for.
            try: w.close()
            except EnvironmentError:
                # e.g. broken pipe because the espeak command is missing;
                # keep this best-effort (as the implicit close was) but say so
                sys.stderr.write("Warning: error closing pipe to espeak\n")
2193
def mainopt_trymac(i):
    """*<format> [<pronunciation>]
Convert phonemes from <format> into Mac and try it using the Mac OS 'say' command"""
    # NOTE: the doc string above is help text (see the mainopt_ conventions
    # comment earlier in the file), so developer notes are kept in comments.
    #
    # i: position in sys.argv such that sys.argv[i+1] is the source format
    #    name; i+2 is handed to getInputText as the place to look for the
    #    pronunciation text.
    # Returns an error-message string if the format is unknown, else None.
    format = sys.argv[i+1]
    if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    for resp in getInputText(i+2,"phonemes in "+format+" format",'maybe'):
        mac = convert(resp,format,'mac')
        toSay = markup_inline_word("mac",mac)
        print(as_printable(toSay)) # show what is about to be sent to the synth
        w = os.popen(macSayCommand()+" -v Vicki","w") # Need to specify a voice because the default voice might not be able to take Apple phonemes. Vicki has been available since 10.3, as has the 'say' command (previous versions need osascript, see Gradint's code)
        try: getBuf(w).write(toSay)
        finally:
            # Close explicitly so the text is flushed to the 'say' process now
            # and the child is waited for, rather than leaving both to
            # whenever the pipe object is garbage-collected.
            try: w.close()
            except EnvironmentError:
                # e.g. broken pipe because the command is unavailable; keep
                # this best-effort (as the implicit close was) but say so
                sys.stderr.write("Warning: error closing pipe to the say command\n")
2205
def mainopt_trymac_uk(i):
    """*<format> [<pronunciation>]
Convert phonemes from <format> and try it with Mac OS British voices (see --mac-uk for details)"""
    # NOTE: the doc string above is help text (see the mainopt_ conventions
    # comment earlier in the file), so developer notes are kept in comments.
    #
    # i: position in sys.argv such that sys.argv[i+1] is the source format
    #    name; i+2 is handed to getInputText as the place to look for the
    #    pronunciation text.
    # Returns an error-message string on refusal/unknown format, else None.
    #
    # Safety guard: this used to be an 'assert', but assertions are removed
    # when Python runs with -O, which would silently disable the protection.
    # Returning the text instead uses the file's normal "returned string is
    # printed with error exit" convention and cannot be optimised away.
    if sys.version_info[0] != 2: return "--trymac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
    format = sys.argv[i+1]
    if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    for resp in getInputText(i+2,"phonemes in "+format+" format",'maybe'):
        macuk = convert(resp,format,'mac-uk')
        # voice name can be overridden with the MACUK_VOICE environment variable
        m = MacBritish_System_Lexicon("",os.environ.get("MACUK_VOICE","Daniel"))
        try:
            # inner try/finally: always close the lexicon object, even if
            # speaking is interrupted or fails
            try: m.speakPhones(macuk.split())
            finally: m.close()
        except KeyboardInterrupt:
            # Ctrl-C abandons this item only; carry on with the next one
            sys.stderr.write("Interrupted\n")
2220
def mainopt_phones(i):
    """*<format> [<words>]
Use eSpeak to convert text to phonemes, and then convert the phonemes to format 'format'.
E.g.: python lexconvert.py --phones unicode-ipa This is a test sentence.
Set environment variable PHONES_PIPE_COMMAND to an additional command to which to write the phones as well as standard output. (If standard input is a terminal then this will be done separately after each line.)
(Some commercial speech synthesizers do not work well when driven entirely from phonemes, because their internal format is different and is optimised for normal text.)
Set format to 'all' if you want to see the phonemes in ALL supported formats."""
    # NOTE: the doc string above is help text (see the mainopt_ conventions
    # comment earlier in the file), so developer notes are kept in comments.
    #
    # i: position in sys.argv such that sys.argv[i+1] is the target format
    #    name (or "all"); i+2 is handed to getInputText as the place to look
    #    for the words.
    # Returns an error-message string on failure, else None.
    format = sys.argv[i+1]
    if format=="example": return "The 'example' format cannot be used with --phones; try --convert, or did you mean --phones festival" # could allow example anyway as it's basically Festival, but save confusion as eSpeak might not generate the same phonemes if our example words haven't been installed in the system's eSpeak. (Still allow it to be used in --try etc though.)
    if not format in lexFormats and not format=="all": return "No such format "+repr(format)+" (use --formats to see a list of formats)"
    hadOneoff = False # becomes True once output has been written with the one-off header enabled
    for response in getInputText(i+2,"text",'maybe'):
        response = pipeThroughEspeak(as_utf8(response).replace(u'\u2032'.encode('utf-8'),as_utf8('')).replace(u'\u00b4'.encode('utf-8'),as_utf8('')).replace(u'\u02b9'.encode('utf-8'),as_utf8('')).replace(u'\u00b7'.encode('utf-8'),as_utf8(''))) # (remove any 2032 and b7 pronunciation marks before passing to eSpeak)
        if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
        if format=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
        else: formats = [format]
        # The loop variable is deliberately NOT called 'format': reusing that
        # name overwrote the requested format, so with 'all' every input line
        # after the first was shown in only the last format of the list.
        for fmt in formats:
            def out(doOneoff=True):
                # Write this response in format fmt to whatever sys.stdout
                # currently is (it is temporarily swapped for the pipe below)
                if len(formats)>1: writeFormatHeader(fmt)
                if doOneoff: getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_oneoff_header")))
                getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_header")))
                output_clauses(fmt,convert(parseIntoWordsAndClauses("espeak",response),"espeak",fmt))
                getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_footer")))
                print("")
                sys.stdout.flush() # in case it's being piped
            out(not hadOneoff) ; hadOneoff = True
            pipeCommand = os.environ.get("PHONES_PIPE_COMMAND","")
            if pipeCommand:
                # Repeat the output into the user's command.  Restore the real
                # stdout and close the pipe even if out() raises, so a failure
                # cannot leave sys.stdout pointing at a dead pipe.
                pipe = os.popen(pipeCommand,'w')
                o,sys.stdout = sys.stdout,pipe
                try: out()
                finally:
                    sys.stdout = o
                    pipe.close()
2251
2252 def mainopt_ruby(i):
2253 """*<format> [<words>]
2254 Like --phones but outputs the result as HTML RUBY markup, with each word's pronunciation symbols placed above the corresponding English word.
2255 E.g.: python lexconvert.py --ruby unicode-ipa This is a test sentence.
2256 This option is made more complicated by the fact that different versions of eSpeak may space the phoneme output differently, for example when handling numbers; if your eSpeak version is not recognised then all numbers are unannotated. Anyway you are advised not to rely on this option working with the new development NG versions of eSpeak. If the version you have behaves unexpectedly, words and phonemes output might lose synchronisation. However this option is believed to be stable when used with simple text and the original eSpeak.
2257 You can optionally set the RUBY_GRADINT_CGI environment variable to the URL of an instance of Gradint Web Edition to generate audio links for each word. If doing this in a Web Adjuster filter, see comments in the lexconvert source for setup details."""
2258 # htmlFilter with --htmlText of course. Set separator to two newlines and copy the generated 'h5a' function (from a manual run or the lexconvert source) into Adjuster's headAppend option (but don't expect HTML5 audio to work from Adjuster's submitBookmarklet option; pronunciation links will take you off the page if it doesn't).
2259 # Use double newlines as single newlines are used in the h5a script; adding that script via bookmarklet doesn't always run it
2260 format = sys.argv[i+1]
2261 if format=="example": return "The 'example' format cannot be used with --ruby; did you mean festival?" # as above
2262 elif format=="all": return "The --phones all option cannot be used with --ruby" # (well you could implement it if you want but the resulting ruby would be quite unwieldy)
2263 if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
2264 text = as_utf8(getInputText(i+2,"text")).replace(u'\u2019'.encode('utf-8'),as_utf8("'")).replace(u'\u2032'.encode('utf-8'),as_utf8("'")).replace(u'\u00b4'.encode('utf-8'),as_utf8("'")).replace(u'\u02b9'.encode('utf-8'),as_utf8("'")).replace(u'\u00b7'.encode('utf-8'),as_utf8('')).replace(u'\u00a0'.encode('utf-8'),as_utf8(' '))
2265 # eSpeak's basic idea of an alphabetical word (most versions?) -
2266 wordRegexps = [r"(?:[A-Z]+['?-])*(?:(?:(?<![A-z.])(?:[A-z]\.)+[A-z](?![A-z.]))|[A-Z]+[a-z](?![A-z])|[A-Z][A-Z]+(?![a-z][A-z])|[A-Z]?(?:[a-z]['?-]?)+|[A-Z])"]
2267 # A dot, when not part of an elipses, followed by a letter is pronounced "dot", and two of them are pronounced "dot dot":
2268 wordRegexps.append(r"(?<!\.\.)\.(?=[A-z])|(?<!\.)\.(?=\.[A-z])")
2269 # ! followed by a letter is pronounced "exclamation", and .! is "dotexclamation"; @ symbols similarly; copyright
2270 atEtc = u"(?:[@!:]|\u00a9)*".encode('utf-8')
2271 wordRegexps.append(as_utf8(r"\.?[!@]+(?=[A-z])|(?<![A-z])@")+atEtc+as_utf8("(?![A-z])|")+unichr(0xa9).encode('utf-8')+atEtc)
2272 # : between numbers if NOT followed by 2 digits:
2273 wordRegexps.append(r"(?<![A-z]):(?![A-z]|[0-9][0-9])")
2274 # - between numbers
2275 wordRegexps.append(r"(?<=[0-9])-(?=[0-9])")
2276 # TODO: if you paste in (e.g.) CJK characters, eSpeak will say "symbol-symbol-symbol" etc, but this is not accounted for by the above regexp so it'll go onto following words.
2277 vLine = espeak_version_line()
2278 if "1.45." in vLine:
2279 # This seems to work in eSpeak 1.45:
2280 # (TODO: test leading 0s & leading decimal)
2281 # a number of 4 digits or less (with any number of digits after the decimal point) is grouped as 1 word:
2282 wordRegexps.append(r"(?<![0-9])[0-9]{1,4}(?:\.[0-9]+)?(?!,?[0-9])")
2283 # and a number of 1 to 3 digits with any number of 000 or ,000 groups, with optional decimal point followed by any number of digits, OR when placed before an integer number of 3-digit groups, is grouped as 1 word:
2284 wordRegexps.append(r"[0-9]{1,3}(?:,?000)*(?:\.[0-9]+)?,?(?=(?:,?[0-9]{3,3})*,?(?:[^0-9]|$))")
2285 text2 = text
2286 elif "1.48." in vLine:
2287 # In eSpeak 1.48 the groups are smaller.
2288 # Decimal point and everything after it = individual
2289 wordRegexps.append(r"(?<=[0-9])\.(?=[0-9])")
2290 for places in range(25): # TODO: really want unbounded, but (?<=...) is fixed-length
2291 wordRegexps.append(r"(?<=[0-9]\."+"[0-9]"*places+r")[0-9]")
2292 # Number with a leading dot grouped as 1 word:
2293 wordRegexps.append(r"(?<![0-9])\.[0-9]+")
2294 # TODO: leading 0s (0000048 goes to 0 000 048)
2295 # For normal numbers:
2296 # A null string w. 3 or 6 digits to go and digits b4 shld match for 'thousand', 'million' (unless 3+ digits are leading 0s, or fewer than 3 leading 0s and whole thing begins with a 0, or it's part of a decimal expansion, in which case different rules apply, but (?<=...) must be fixed-length, so we need another one of these awful loops) :
2297 for prevDigits in range(10):
2298 for beforeThat in ["^",r"[^.0-9,]"]: # beginning of string, or something OTHER than a decimal point / num
2299 wordRegexps.append(r"(?<="+beforeThat+"[1-9]"+"[0-9,]"*prevDigits+r")(?<!,)(?<!000)(?# empty string )(?=(?:,?(?:[0-9]{3,3}))+(?:[^0-9]|$))")
2300 # 1-9 (not 0) with 2, 5 or 8 etc digits to go = "N-hundred-and" :
2301 wordRegexps.append(r"[1-9](?=[0-9][0-9](?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
2302 # + 0 with 2 digits to go when preceded by digits = "and", as long as followed by at least one non-0:
2303 wordRegexps.append(r"(?<=[0-9,])0(?=(?:[0-9][1-9]|[1-9][0-9])(?:[^0-9,]|$))")
2304 # 1 or 2 digits with 0,3,6.. to go = "seventy-six" or whatever, as long as they're not both 0 :
2305 wordRegexps.append(r"(?:0[1-9]|[1-9][0-9]?)(?=(?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
2306 # 0 by itself (not preceded by digits) = "nought" :
2307 wordRegexps.append(r"(?<![0-9])0(?=[^0-9]|$)")
2308 wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:of|on|in|that|with|for|was) (?:the|a)(?= )")
2309 wordRegexps.insert(0,"(?:Of|On|In|That|With|For|Was) (?:the|a)(?= )")
2310 wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])not a(?= )")
2311 wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:some|that) one(?= )")
2312 wordRegexps.insert(0,"(?:Some|That) one(?= )")
2313 text2 = text
2314 else: text2 = re.sub(r"\.?[0-9]+","",text) # unknown eSpeak version: don't annotate the numbers
2315 response = pipeThroughEspeak(text2)
2316 if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
2317 gradint_cgi = os.environ.get("RUBY_GRADINT_CGI","")
2318 if gradint_cgi:
2319 linkStart,linkEnd = lambda w:maybe_bytes('<a href="',w)+maybe_bytes(gradint_cgi,w)+maybe_bytes('?js=[[',w)+w.replace(maybe_bytes('%',w),maybe_bytes('%25',w)).replace(maybe_bytes('&',w),maybe_bytes('%26',w))+maybe_bytes(']]&jsl=en" onclick="return h5a(this);">',w), '</a>'
2320 print(r"""<script><!-- // HTML5-audio function
2321 function h5a(link) {
2322 if (document.createElement) {
2323 var ae = document.createElement('audio');
2324 if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/mpeg'))) {
2325 ae.setAttribute('src', link.href);
2326 ae.play(); return false;
2327 } else if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/ogg'))) {
2328 ae.setAttribute('src', link.href+"&filetype=ogg");
2329 ae.play(); return false; }
2330 } return true; }
2331 //--></script>""")
2332 else: linkStart,linkEnd = lambda w:maybe_bytes("",w), ""
2333 rubyList = []
2334 for clause in parseIntoWordsAndClauses("espeak",response):
2335 for w in clause:
2336 converted = convert(w,"espeak",format)
2337 if not converted: continue # e.g. a lone _:_:
2338 m = markup_inline_word(format,converted)
2339 rubyList.append(linkStart(w)+m.replace(maybe_bytes("&",m),maybe_bytes("&amp;",m)).replace(maybe_bytes("<",m),maybe_bytes("&lt;",m))+maybe_bytes(linkEnd,w))
2340 rubyList.reverse() # so can pop() left-to-right order
2341 # Write out re.sub ourselves, because (1) some versions of the library (e.g. on 2.7.12) try to do some things in-place, and we're using previous-context regexps that aren't compatible with previous things having been already <ruby>'ified, and (2) if we match a 0-length string, re.finditer won't ALSO return a non-0 length match starting in the same place, and we want both (so we're using wordRegexps as a list rather than an | expression)
2342 matches = {}
2343 debug = False # if True, will add ruby title=(index of the regexp that matched)
2344 debugCount = 0
2345 for r in wordRegexps:
2346 for match in re.finditer(maybe_bytes(r,text),text):
2347 matches[(match.start(),match.end())] = debugCount
2348 debugCount += 1
2349 i = 0 ; r = []
2350 def cmpFunc(a,b):
2351 (s1,e1),(s2,e2) = a,b
2352 if s1<s2: return -1
2353 if s1>s2: return 1
2354 if e1>e2: return -1
2355 if e1<e2: return 1
2356 return 0
2357 for start,end in sorted(list(matches.keys()),cmpFunc):
2358 if start<i: continue # overlap??
2359 r.append(text[i:start])
2360 if start==end: m = "&nbsp;"
2361 else: m = text[start:end].replace(maybe_bytes("&",text),maybe_bytes("&amp;",text)).replace(maybe_bytes("<",text),maybe_bytes("&lt;",text))
2362 try: rt = rubyList.pop()
2363 except: rt = "ERROR" # we've lost synchronisation
2364 if debug: title = as_utf8(" title=")+as_utf8(str(matches[(start,end)]))
2365 else: title = as_utf8("")
2366 r.append(as_utf8("<ruby")+title+as_utf8("><rb>")+m+as_utf8("</rb><rt>")+rt+as_utf8("</rt></ruby>"))
2367 i = end
2368 r.append(text[i:])
2369 while rubyList: # oops, lost synchronisation the other way (TODO: show this per-paragraph? but don't call eSpeak too many times if processing many short paragraphs)
2370 r.append(as_utf8("<ruby><rb>ERROR</rb><rt>")+rubyList.pop()+as_utf8("</rt></ruby>"))
2371 out = as_utf8("").join(r)
2372 if not out.endswith(as_utf8("\n")): out += as_utf8("\n")
2373 getBuf(sys.stdout).write(out)
2374
def pipeThroughEspeak(inpt):
    "Writes inpt to espeak -q -x (in chunks if necessary) and returns the result"
    # inpt must be a byte-string, and the result is a byte-string too:
    # either eSpeak's output or, if a chunk's response is a single line
    # containing 'command' (shell "command not found" etc), that stripped
    # message on its own.
    assert type(inpt)==bytes
    bufsize = 8192 # careful not to set this too big, as the OS might limit it (TODO can we check?)
    # The data we search is bytes, so the needles must be bytes as well:
    # on Python 3, looking for a str inside bytes raises TypeError.
    newline,space,command = as_utf8('\n'),as_utf8(' '),as_utf8('command')
    ret = []
    while len(inpt) > bufsize:
        # Prefer to cut just after the last newline in the first bufsize
        # bytes, failing that after the last space, failing that anywhere.
        splitAt = inpt.rfind(newline,0,bufsize)+1
        if not splitAt: # no newline, try to split on space
            splitAt = inpt.rfind(space,0,bufsize)+1
            if not splitAt:
                sys.stderr.write("Note: had to split eSpeak input and couldn't find a newline or space to do it on\n")
                splitAt = bufsize
        # (the chunk is at most bufsize long, so this recurses only 1 level)
        response = pipeThroughEspeak(inpt[:splitAt])
        if not newline in response.rstrip() and command in response: return response.strip() # 'bad cmd' / 'cmd not found'
        ret.append(response) ; inpt=inpt[splitAt:]
    try: w,r=os.popen4("espeak -q -x",bufsize=bufsize) # Python 2
    except AttributeError: # Python 3 (os.popen4 does not exist)
        import subprocess
        proc=subprocess.Popen(['espeak','-q','-x'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
        # communicate() writes the input, closes stdin and collects the output
        out,err=proc.communicate(inpt)
        r = as_utf8("")
        if out: r += out
        if err: r += err # (stderr is not piped above, so err is normally None)
    else: # Python 2: popen4 gave us (child stdin, child stdout+stderr)
        getBuf(w).write(inpt) ; w.close()
        r = getBuf(r).read()
    return as_utf8("\n").join(ret) + r
2406
def espeak_version_line():
    "Returns the first line printed by 'espeak -h' (stderr included); callers search it for the eSpeak version number"
    helpOutput = os.popen("espeak -h 2>&1").read()
    firstLine = helpOutput.strip().split("\n")[0]
    return firstLine
2408
def writeFormatHeader(format):
    "Prints the name of 'format' underlined with dashes, for use when outputting in all formats. A blank line is printed first on every call except the first, since the previous format's output might have been more than one line."
    global writeFormatHeader_called
    underline = '-'*len(format)
    if writeFormatHeader_called:
        print("") # separate from the previous format's output
    print(format)
    print(underline)
    writeFormatHeader_called = True
# Module-level flag: has writeFormatHeader been called yet?
writeFormatHeader_called = False
2417
def mainopt_check_variants(i):
    # undocumented (won't appear in help text)
    # (NOTE(review): presumably the help text is built from the mainopt_*
    # docstrings, so this function deliberately has comments instead.)
    # Groups the non-consonant eSpeak phoneme codes by their main phoneme
    # number (variants share the integer part) and, for every group that
    # has more than one member, speaks the group's codes through espeak
    # repeatedly until the user answers 0.
    groups = {}
    for k,v in lexFormats['espeak'].items():
        if type(k)==str:
            intV = int(v)
            if not intV in consonants:
                groups.setdefault(intV,[]).append((v,k))
    # sorted() rather than items().sort(): on Python 3 items() is a view
    # object, which has no sort method
    for k,v in sorted(groups.items()):
        if len(v)==1: continue
        v.sort()
        while True:
            print("Group "+str(k))
            es = os.popen("espeak -x","w")
            getBuf(es).write(as_utf8('\n').join([markup_inline_word("espeak",w) for _,w in v]))
            es.close() # explicit close (instead of relying on del) so espeak gets EOF and finishes before we prompt
            if not int(str(input("Again? 1/0: "))): break
2436
def mainopt_check_for_similar_formats(i):
    # undocumented (won't appear in help text)
    # (NOTE(review): presumably the help text is built from the mainopt_*
    # docstrings, so this function deliberately has comments instead.)
    # For every pair of formats, counts the phoneme numbers of the first
    # format that the second either lacks or maps to a different string,
    # then prints the pairs in increasing order of that count.
    items = list(lexFormats.items()) # a real list: Python 3's items() view cannot be indexed or sliced
    r = []
    for idx,(k1,dic1) in enumerate(items):
        for k2,dic2 in items[idx+1:]:
            diff = 0
            for kk,vv in dic1.items():
                if not type(kk)==int: continue
                if kk==syllable_separator: continue
                # missing from dic2 counts as a difference (tested with
                # 'in' rather than a "!"+vv default, which would raise
                # TypeError on Python 3 whenever vv is a byte-string)
                if not (kk in dic2 and dic2[kk]==vv): diff += 1
            r.append((diff,k1,k2))
    r.sort() ; had = set()
    for diffs,format1,format2 in r:
        if format1 in had and format2 in had: continue # both already reported
        had.add(format1) ; had.add(format2)
        if "names" in had: break
        print(str(diffs)+" phoneme differences between "+format1+" and "+format2)
2456
def festival_group_stress(pronunc):
    "Special-case cleanup_func for the Festival format: takes space-separated phonemes with 0/1/2 stress digits among them, and returns them bracketed into groups that each end on a vowel and carry a stress digit"
    # TODO: do we ever need to add extra consonants to the
    # previous group instead of the next group? (not sure
    # what difference it makes to the synthesis, but it
    # might make the entry a bit more readable)
    finished = [] # [phonemes,stress] pairs, each ending on a vowel
    pending = []  # phonemes seen since the last vowel
    for token in pronunc.split():
        if token in ['0','1','2']:
            # a stress digit raises (never lowers) the stress of the
            # last completed group; dropped if there is no group yet
            if finished and token >= finished[-1][1]:
                finished[-1][1] = token
            continue
        pending.append(token)
        if token[:1] in 'aeiou@': # a vowel completes the group
            finished.append([pending,'0'])
            pending = []
    if pending:
        # left-over consonants join the last group if there is one,
        # otherwise they have to form a group by themselves
        if finished: finished[-1][0] += pending
        else: finished.append([pending,'0'])
    bracketed = []
    for phonemes,stress in finished:
        bracketed.append("(("+' '.join(phonemes)+') '+stress+")")
    return "("+' '.join(bracketed)+")"
2479
def mainopt_convert(i):
    """*<from-format> <to-format>
Convert a user lexicon (generally from its default filename; if this cannot be found then lexconvert will tell you what it should be).
E.g.: python lexconvert.py --convert festival cepstral"""
    # i is the index in sys.argv of this option; the two formats follow it.
    # Returns an error message string if the request is rejected up front,
    # otherwise None after convert_user_lexicon has written the output file.
    fromFormat = sys.argv[i+1]
    toFormat = sys.argv[i+2]
    if fromFormat==toFormat: return "Cannot convert a lexicon to its own format (that could result in it being truncated)"
    if toFormat=="mac-uk": return "Cannot permanently save a Mac-UK lexicon; please use the --mac-uk option to read text"
    if toFormat=="example": return "Cannot overwrite the built-in example lexicon"
    for f in [fromFormat,toFormat]:
        if not f in lexFormats: return "No such format "+repr(f)+" (use --formats to see a list of formats)"
    try:
        fname=getSetting(toFormat,"lex_filename")
        getSetting(toFormat,"lex_entry_format") # convert_user_lexicon will need this
    except KeyError: fname = None
    if not fname: return "Write support for lexicons of format '%s' not yet implemented (need at least lex_filename and lex_entry_format); try using --phones or --phones2phones options instead" % (toFormat,)
    if toFormat=="espeak":
        assert fname=="en_extra", "If you changed eSpeak's lex_filename in the table you also need to change the code below"
        if os.system("mv en_extra en_extra~ && (grep \" // \" en_extra~ || true) > en_extra"): sys.stderr.write("Warning: en_extra not found, making a new one\n(espeak compile will probably fail in this directory)\n") # otherwise keep the commented entries, so can incrementally update the user lexicon only
        outFile=open(fname,"a")
    else:
        # Refuse to overwrite a non-empty file. A file that is missing or
        # unreadable counts as empty, but only I/O errors are ignored now
        # (not every exception), and the probe handle is always closed.
        existing = 0
        try:
            probe = open(fname)
            try: existing = getBuf(probe).read()
            finally: probe.close()
        except EnvironmentError: pass
        assert not existing, "File "+replHome(fname)+" already exists and is not empty; are you sure you want to overwrite it? (Delete it first if so)" # (if you run with python -O then this is ignored, as are some other checks so be careful)
        outFile=open(fname,"w")
    print ("Writing %s lexicon entries to %s file %s" % (fromFormat,toFormat,fname))
    try: convert_user_lexicon(fromFormat,toFormat,outFile)
    except Message:
        # don't leave a partly-written lexicon behind
        print (" - error, deleting "+fname)
        os.remove(fname) ; raise
2514
def mainopt_festival_dictionary_to_espeak(i):
    """<location>
Convert the Festival Oxford Advanced Learners Dictionary (OALD) pronunciation lexicon to eSpeak.
You need to specify the location of the OALD file in <location>,
e.g. for Debian festlex-oald package: python lexconvert.py --festival-dictionary-to-espeak /usr/share/festival/dicts/oald/all.scm
or if you can't install the Debian package, try downloading http://ftp.debian.org/debian/pool/non-free/f/festlex-oald/festlex-oald_1.4.0.orig.tar.gz, unpack it into /tmp, and do: python lexconvert.py --festival-dictionary-to-espeak /tmp/festival/lib/dicts/oald/oald-0.4.out
In all cases you need to cd to the eSpeak source directory before running this. en_extra will be overwritten. Converter will also read your ~/.festivalrc if it exists. (You can later incrementally update from ~/.festivalrc using the --convert option; the entries from the system dictionary will not be overwritten in this case.) Specify --without-check to bypass checking the existing eSpeak pronunciation for OALD entries (much faster, but makes a larger file and in some cases compromises the pronunciation quality)."""
    # i is the index in sys.argv of this option. Returns an error message
    # string if the location is missing or a needed file can't be opened,
    # otherwise runs the conversion and returns None.
    try: festival_location=sys.argv[i+1]
    except IndexError: return "Error: --festival-dictionary-to-espeak must be followed by the location of the festival OALD file (see help text)"
    # The two open() calls are only probes that the files can be read, so
    # close the handles at once instead of leaking them, and treat only
    # I/O errors (not every exception) as "could not be opened"
    try: open(festival_location).close()
    except EnvironmentError: return "Error: The specified OALD location '"+festival_location+"' could not be opened"
    try: open("en_list").close()
    except EnvironmentError: return "Error: en_list could not be opened (did you remember to cd to the eSpeak dictsource directory first?"
    # 2nd argument: check existing eSpeak pronunciation unless --without-check;
    # 3rd argument: true if ~/.festivalrc exists ('test -e' exits with 0)
    convert_system_festival_dictionary_to_espeak(festival_location,not '--without-check' in sys.argv,not os.system("test -e ~/.festivalrc"))
2529
def mainopt_syllables(i):
    """[<words>]
Attempt to break 'words' into syllables for music lyrics (uses espeak to determine how many syllables are needed)"""
    # As explained on mainopt_ruby's help text, espeak -x output can't be relied on to always put a space between every input word. Rather than try to guess what espeak is going to do, here we simply put a newline after every input word instead. This might affect eSpeak's output (so not recommended for mainopt_ruby), but it should be OK for just counting the syllables. (Also, the assumption that the input words have been taken from song lyrics usefully rules out certain awkward punctuation cases.)
    for txt in getInputText(i+1,"word(s)",'maybe'):
        words=txt.split()
        # one word per line, with ! : and . removed
        toSpeak = as_utf8('\n').join(as_utf8(w) for w in words)
        for punctuation in ["!",":","."]:
            toSpeak = toSpeak.replace(as_utf8(punctuation),as_utf8(""))
        response = pipeThroughEspeak(toSpeak)
        isOneLine = not as_utf8('\n') in response.rstrip()
        if isOneLine and as_utf8('command') in response:
            return response.strip() # 'bad cmd' / 'cmd not found'
        # pair each input word with the corresponding non-empty output line
        phonemeLines = [line for line in response.split(as_utf8("\n")) if line]
        hyphenated = []
        for word,line in zip(words,phonemeLines):
            syllables = sylcount(convert(line,"espeak","example"))
            hyphenated.append(hyphenate(word,syllables))
        print (" ".join(hyphenated))
        sys.stdout.flush() # in case piped
2541
def wordSeparator(format):
    "Returns the effective word separator of format: its word_separator setting if it has one, otherwise its phoneme_separator, otherwise a space"
    fallback = checkSetting(format,"phoneme_separator"," ")
    return checkSetting(format,"word_separator",fallback)
2545
def mainopt_phones2phones(i):
    """*<format1> <format2> [<phonemes in format1>]
Perform a one-off conversion of phonemes from format1 to format2 (format2 can be 'all' if you want)""" # If format1 is 'example' and you don't specify phonemes, we take the words from the example lexicon. But don't say that in the help string because it might confuse the issue about phonemes being optional on the command line and prompted for if not specified and stdin is not piped in all formats other than 'example'.
    # i is the index in sys.argv of this option. Returns an error message
    # string for an unknown format, otherwise prints the conversion(s) and
    # returns None.
    format1,format2 = sys.argv[i+1],sys.argv[i+2]
    if not format1 in lexFormats: return "No such format "+repr(format1)+" (use --formats to see a list of formats)"
    if not format2 in lexFormats and not format2=="all": return "No such format "+repr(format2)+" (use --formats to see a list of formats)"
    if format1=="example" and len(sys.argv)<=i+3:
        if stdin_is_terminal(): txt=""
        else: txt=getBuf(sys.stdin).read() # and it might still be ""
        # (the parse result used to be thrown away here, which left
        # 'clauses' unbound whenever 'example' input was piped in)
        if txt: clauses=parseIntoWordsAndClauses(format1,txt)
        else: clauses=[[x[1]] for x in getSetting('example','lex_read_function')()]
    else: clauses = parseIntoWordsAndClauses(format1,getInputText(i+3,"phonemes in "+format1+" format"))
    if format2=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
    else: formats = [format2]
    for format2 in formats:
        if len(formats)>1: writeFormatHeader(format2)
        getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_header")))
        output_clauses(format2,convert(clauses,format1,format2))
        getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_footer"))) ; print("")
2565
def parseIntoWordsAndClauses(format,phones):
    "Splits 'phones' (assumed to be in format 'format') on that format's clause and word separators, returning a list of clauses, each a list of words; empty words and empty clauses are left out"
    wordSep = checkSetting(format,"word_separator") # don't use wordSeparator() here - we're splitting, not joining, so we don't want it to default to phoneme_separator
    clauseSep = checkSetting(format,"clause_separator","\n")
    def splitArg(sep):
        # what to pass to split() for this separator
        if sep==" ": return None # " " means ANY whitespace (TODO: document this?)
        return maybe_bytes(sep,phones)
    if clauseSep and type(clauseSep) in [bytes,unicode]:
        rawClauses = phones.split(splitArg(clauseSep))
    else: rawClauses = [phones] # no (string) clause separator: all one clause
    clauses = []
    for rawClause in rawClauses:
        if wordSep: words = rawClause.split(splitArg(wordSep))
        else: words = [rawClause]
        words = [w for w in words if w]
        if words: clauses.append(words)
    return clauses
2581
def mainopt_mac_uk(i):
    """<from-format> [<text>]
Speak text in Mac OS 10.7+ British voices while using a lexicon converted in from <from-format>. As these voices do not have user-modifiable lexicons, lexconvert must binary-patch your system's master lexicon; this is at your own risk! (Superuser privileges are needed the first time. A backup of the system file is made, and all changes are restored on normal exit but if you force-quit then you might need to restore the backup manually. Text speaking needs to be under lexconvert's control because it usually has to change the input words to make them fit the available space in the binary lexicon.) By default the Daniel voice is used; Emily or Serena can be selected by setting the MACUK_VOICE environment variable."""
    # If you have xterm etc, then text will also be printed, with words from the altered lexicon underlined.
    assert sys.version_info[0]==2, "--mac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
    fromFormat = sys.argv[i+1]
    if fromFormat not in lexFormats:
        return "No such format "+repr(fromFormat)+" (use --formats to see a list of formats)"
    lex = get_macuk_lexicon(fromFormat)
    try:
        for line in getInputText(i+2,"text",True):
            voice = os.environ.get("MACUK_VOICE","Daniel")
            sysLex = MacBritish_System_Lexicon(line,voice)
            # close() must run even if reading fails or is interrupted
            try:
                sysLex.readWithLex(lex)
            finally:
                sysLex.close()
    except KeyboardInterrupt:
        sys.stderr.write("Interrupted\n")
2597
class Counter(object):
    "Holds two class-level numbers shared by the phoneme-creating functions below: c (the current phoneme number) and sc (the variant sub-number within it)"
    c = 0
    sc = 0
def other():
    "Used by Phonemes() when creating something that is neither a vowel nor a consonant, e.g. a stress mark: moves on to the next phoneme number, resets the variant sub-number, and returns the new number"
    Counter.c = Counter.c + 1
    Counter.sc = 0
    return Counter.c
# the phoneme numbers handed out by consonant() and by vowel():
consonants = set()
mainVowels = set()
def consonant():
    "Used by Phonemes() when creating a consonant: a new phoneme number that is also recorded in 'consonants'"
    number = other()
    consonants.add(number)
    return number
def vowel():
    "Used by Phonemes() when creating a vowel: a new phoneme number that is also recorded in 'mainVowels'"
    number = other()
    mainVowels.add(number)
    return number
def opt_vowel():
    "Used by Phonemes() when creating an optional vowel (one that has no warning issued if some format doesn't support it), so it is NOT recorded in 'mainVowels'"
    return other()
def variant():
    "Used by Phonemes() when creating a variant of the just-defined vowel/consonant/etc: returns (0, N.M) where N is the current phoneme number and M the next variant sub-number"
    subNumber = Counter.sc + 1
    # sub-numbers ending in 0 are skipped: as floats, N.10 is the same as N.1
    while subNumber % 10 == 0: subNumber += 1
    Counter.sc = subNumber
    return 0, float('%d.%d' % (Counter.c,subNumber))
    # the 0 is so we can say _, name = variant()
    # so as to get some extra indentation
2621
def ifset(var,a,b=""):
    "Returns a if the environment variable var is set to a non-empty value, otherwise returns b. Used in LexFormats to create tables with variations set by the environment."
    import os
    value = os.environ.get(var)
    if not value: # unset, or set but empty
        return b
    return a
2627
def speakjet(symbol,opcode):
    "Special-case function for the Speakjet table: returns the opcode as a single character if SPEAKJET_BINARY is set in the environment, the symbol if SPEAKJET_SYM is set, and otherwise the opcode as a decimal string"
    assert type(opcode)==int
    if not ifset('SPEAKJET_BINARY',1):
        return ifset('SPEAKJET_SYM',symbol,str(opcode))
    # binary output was asked for, which can't be combined with symbols
    assert not ifset('SPEAKJET_SYM',1), "Cannot set both SPEAKJET_SYM and SPEAKJET_BINARY"
    return chr(opcode)
2635
def makeDic(doc,*args,**kwargs):
    "Make a dictionary with a doc string, default-bidirectional mappings and extra settings; see LexFormats for how this is used."
    # doc: description of the format (stored as the "doc" setting).
    # args: (key,value) or (key,value,bidirectional) tuples, where one of
    #   key/value is a string and the other a phoneme number (int or float).
    # kwargs: settings, each stored under the key ('settings',name).
    # Returns the new dictionary, and also records it in the module-level
    # lastDictionaryMade.
    assert type(doc)==str, "doc must be a string"
    d = {} ; duplicates = set()
    for a in args:
        assert type(a)==tuple and (len(a)==2 or len(a)==3)
        k=a[0]
        if k in d: duplicates.add(k)
        v=a[1]
        assert (type(k) in [bytes,unicode] and type(v) in [int,float]) or (type(v) in [bytes,unicode] and type(k) in [int,float]), "Wrong types "+repr(a)+" (did you forget a _, before calling variant() or something?)"
        d[k] = v
        # a Unicode key is stored under its as_utf8() form as well
        if type(k)==unicode: d[as_utf8(k)] = v
        # mappings are bidirectional unless a 3rd tuple item says otherwise
        if len(a)==3: bidir=a[2]
        else: bidir=True
        if bidir:
            # (k,v,True) = both (k,v) and (v,k)
            if v in d: duplicates.add(v)
            d[v] = k
    assert not duplicates, " Duplicate key(s) in "+repr(doc)+": "+", ".join((repr(dup)+"".join(" (="+g+")" for g,val in globals().items() if val==dup)) for dup in sorted(list(duplicates)))+". Did you forget a ,False to suppress bidirectional mapping?" # by the way, Python does not detect duplicate keys in {...} notation - it just lets you overwrite
    # the numbers created by consonant() and vowel() that this format lacks
    missing = [l for l in (list(consonants)+list(mainVowels)) if not l in d]
    # did_approx = False
    # NOTE(review): only the PRESENCE of approximate_missing in kwargs is
    # tested here, not its value, so approximate_missing=False would still
    # turn the approximations on - confirm that no table relies on that.
    if missing and 'approximate_missing' in kwargs:
        # each entry: (missing phoneme, [phonemes to join up in its place])
        for miss,approxTo in [
            # TODO: put this table somewhere else?
            # (If the thing on the right is just 1 item, we could make the thing on the left a variant of it. But that might not be a good idea unless they're really very close, since if it's a variant then the substitution is done without warning even if approximate_missing is not set.)
            (a_as_in_ago, [u_as_in_but]),
            (a_as_in_air, [e_as_in_them,r]),
            (ear, [e_as_in_eat,u_as_in_but]),
            (oor_as_in_poor, [close_to_or]), # TODO: ,r?
            (a_as_in_ah,[a_as_in_apple]), # this seems to be missing in some American voices (DecTalk, Keynote, SAM); TODO: is this the best approximation we can do?
            (a_as_in_apple,[a_as_in_ah]), # the reverse of the above, for Devanagari
            (o_as_in_orange,[oo_as_in_food]),(o_as_in_go,[oo_as_in_food]),(oy_as_in_toy,[oo_as_in_food,i_as_in_it]),(o_as_in_now,[a_as_in_ah, w]),(e_as_in_herd,[u_as_in_but,u_as_in_but]),(ar_as_in_year,[u_as_in_but,u_as_in_but]),(eye,[a_as_in_ah,y]),(th_as_in_think,[th_as_in_them]), # (Devanagari: is this really the best we can do?)
            ]:
            # only usable if this format has every phoneme of the approximation
            if miss in missing and all(x in d for x in approxTo):
                # join the format's own strings for them with its phoneme separator
                d[miss]=maybe_bytes(kwargs.get("phoneme_separator"," "),d[approxTo[0]]).join(d[x] for x in approxTo)
                # did_approx = True
                missing.remove(miss)
    # if did_approx: doc="(approx.) "+doc # and see also the code in makeVariantDic. Commenting out because this is misleading: the formats where we didn't do a did_approx might also contain approximations of some kind. Incidentally there are some British English voices that need approximate_missing (e.g. Apollo 2)
    d[("settings","doc")] = doc
    if missing:
        # warn (on standard error) about whatever is still missing; the names
        # shown are those of the module-level variables holding each number
        import sys ; sys.stderr.write("WARNING: Some non-optional vowels/consonants are missing from "+repr(doc)+"\nThe following are missing: "+", ".join("/".join(g for g,val in globals().items() if val==m) for m in missing)+"\n")
    for k,v in kwargs.items(): d[('settings',k)] = v
    assert type(d.get(('settings','cleanup_regexps'),[]))==list, "cleanup_regexps must be a list" # not one tuple
    assert type(d.get(('settings','cvtOut_regexps'),[]))==list, "cvtOut_regexps must be a list" # not one tuple
    # neither separator may also be one of the dictionary's own keys
    wsep = d.get(('settings','word_separator'),None)
    psep = d.get(('settings','phoneme_separator'),' ')
    if not wsep==None: assert not wsep in d, "word_separator duplicates with a key in "+repr(doc)
    if not psep==None: assert not psep in d, "phoneme_separator duplicates with a key (did you forget to change the default, or to add a ,False somewhere?) in "+repr(doc)
    global lastDictionaryMade ; lastDictionaryMade = d
    return d
def makeVariantDic(doc,*args,**kwargs):
    "Like makeDic but create a new 'variant' version of the last-made dictionary, modifying some phonemes and settings (and giving it a new doc string) but keeping everything else the same. Any list settings (e.g. cleanup_regexps) are ADDED to by the variant; other settings and phonemes are REPLACED if they are specified in the variant. If you don't want subsequent variants to inherit the changes made by this variant, add noInherit=True to the keyword args."
    # doc, args and kwargs are as for makeDic, except that the noInherit
    # keyword is consumed here and not passed on. Returns the merged
    # dictionary (last-made dictionary + this variant's changes).
    global lastDictionaryMade,mainVowels,consonants
    ldmOld = lastDictionaryMade
    toUpdate = lastDictionaryMade.copy()
    noInherit = kwargs.pop('noInherit',False)
    oldV,oldC = mainVowels,consonants
    mainVowels,consonants = [],[] # so makeDic doesn't complain if some vowels/consonants are missing
    try: d = makeDic(doc,*args,**kwargs) # d has ONLY the variant's own entries
    finally: mainVowels,consonants = oldV,oldC # put these back even if makeDic raised
    # if toUpdate[("settings","doc")].startswith("(approx.) ") and not d[("settings","doc")].startswith("(approx.) "): d[("settings","doc")]="(approx.) "+d[("settings","doc")] # TODO: always?
    for k,v in toUpdate.items():
        # list settings are added to rather than replaced
        if type(v)==list and k in d: d[k] = v+d[k]
    toUpdate.update(d)
    # makeDic has just set lastDictionaryMade to d, i.e. to the variant's
    # overrides alone. Leaving it like that would make the NEXT variant
    # lose everything that came from the base dictionary, so point it at
    # the merged result instead (or back at the old one if noInherit).
    if noInherit: lastDictionaryMade = ldmOld
    else: lastDictionaryMade = toUpdate
    return toUpdate
def getSetting(formatName,settingName):
    "Returns the named setting of the named format in lexFormats; raises KeyError if either is not there"
    formatTable = lexFormats[formatName]
    return formatTable[('settings',settingName)]
def checkSetting(formatName,settingName,default=""):
    "Returns the named setting of the named format in lexFormats, or 'default' if that format has no such setting"
    formatTable = lexFormats[formatName]
    return formatTable.get(('settings',settingName),default)
2711
2712 import sys,re,os
# Python 2/3 compatibility. Each 'except' names the one exception that
# signals "not this Python version", so that anything else (including
# Ctrl-C) is no longer silently swallowed.
try: from subprocess import getoutput
except ImportError: from commands import getoutput # Python 2
try: bytes # Python 3 and newer Python 2
except NameError: bytes = str # older Python 2
try: unicode # Python 2
except NameError: # Python 3
    unicode,unichr,xrange = str,chr,range
    def chr(x): return bytes([x]) # as in Python 2, chr gives a 1-byte byte-string
    _builtin_sorted = sorted
    from functools import cmp_to_key
    def sorted(l,theCmp=None):
        "Like Python 2's sorted: the optional 2nd argument is a cmp-style comparison function"
        if theCmp:
            return _builtin_sorted(l,key=cmp_to_key(theCmp))
        else: return _builtin_sorted(l)
    # (compares major AND minor version, not just the minor one)
    assert sys.version_info[:2] >= (3,5), "lexconvert cannot run on Python 3.4 due to lack of byte-string percent formatting (PEP 461). Please use Python 3.5+ or stick with Python 2."
def getBuf(f):
    "Returns the object to use for reading or writing bytes on file f, for Python 2 and 3 compatibility: f.buffer if f has one (Python 3), otherwise f itself (Python 2)"
    return getattr(f,'buffer',f)
2732
# One-entry cache for make_dictionary: the most recent (source,dest) pair and its result
cached_sourceName,cached_destName,cached_dict = None,None,None
def make_dictionary(sourceName,destName):
    "Uses lexFormats to make a mapping dictionary from a particular source format to a particular dest format, and also sets module variables for that particular conversion (TODO: put those module vars into an object in case someone wants to use this code in a multithreaded server)"
    # Returns a dictionary that maps each of the source format's phoneme
    # strings to the dest format's string for the same phoneme. The module
    # variables set are dest_consonants, dest_syllable_sep and
    # implicit_vowel_before_NL.
    global cached_sourceName,cached_destName,cached_dict
    # same pair as last time: the module variables are still the right ones
    if (sourceName,destName) == (cached_sourceName,cached_destName): return cached_dict
    source = lexFormats[sourceName]
    dest = lexFormats[destName]
    d = {}
    global dest_consonants ; dest_consonants = set()
    global dest_syllable_sep ; dest_syllable_sep = dest.get(syllable_separator,"")
    global implicit_vowel_before_NL
    implicit_vowel_before_NL = None
    for k,v in source.items():
        if type(k)==tuple: continue # settings
        if type(v) in [bytes,unicode]: continue # (num->string entries are for converting IN to source; we want the string->num entries for converting out)
        if not v in dest: v = int(v) # (try the main version of a variant)
        if not v in dest: continue # (haven't got it - will have to ignore or break into parts)
        assert type(k) in [bytes,unicode]
        d[k] = dest[v]
        # remember which of the dest's strings stand for consonants
        if int(v) in consonants: dest_consonants.add(d[k])
        # dest's string for e_as_in_herd; a variant of it is only taken when
        # nothing has been found yet, and the main (non-variant) version
        # always replaces whatever was found before
        if int(v)==e_as_in_herd and (not implicit_vowel_before_NL or v==int(v)): # TODO: or u_as_in_but ? used by festival and some other synths before words ending 'n' or 'l' (see usage of implicit_vowel_before_NL later)
            implicit_vowel_before_NL = d[k]
        # make the entry reachable by the UTF-8 and (if decodable) the Unicode form of the key
        d[as_utf8(k)] = d[k]
        try: d[as_unicode(k)] = d[k]
        except UnicodeDecodeError: pass
    try:
        if any(type(v)==unicode for v in d.values()): d,dest_consonants=dict((k,as_unicode(v)) for k,v in d.items()),set(as_unicode(v) for v in dest_consonants) # Python 2: if ANY dest are Unicode, make them ALL Unicode
    except UnicodeDecodeError: d,dest_consonants=dict((k,as_utf8(v)) for k,v in d.items()),set(as_utf8(v) for v in dest_consonants) # ... or make them ALL byte-strings if some were binary and not readable as UTF-8
    cached_sourceName,cached_destName,cached_dict=sourceName,destName,d
    return d
2763
warnedAlready = set() # (character, debugInfo) pairs that convert() has already printed an "ignoring ... character" warning for
def convert(pronunc,source,dest):
    """Convert pronunc from source to dest. pronunc can be a string or a list; if a list then we'll recurse on each of the list elements and return a new list (this is meant for batch-converting clauses etc).

    source and dest are format names, passed to checkSetting(),
    make_dictionary() and used as keys into lexFormats.

    Steps, in order:
    1. Apply the source format's cvtOut_func (if set) and its
       cvtOut_regexps to pronunc.
    2. Repeatedly match the longest prefix of pronunc that is a key of
       the source-to-dest dictionary, appending the mapped value to the
       output list; a character that starts no known key is dropped
       (with a warning on stderr, subject to the source format's
       safe_to_drop_characters setting).
    3. While doing so, move stress marks before/after the vowel as
       needed by the two formats' stress_comes_before_vowel settings,
       and insert/remove the implicit vowel before n or l.
    4. Join the output with the dest format's phoneme_separator, then
       apply the dest format's cleanup_regexps and cleanup_func.

    Relies on module-level names that are not defined in this function:
    lexFormats, primary_stress, secondary_stress, syllable_separator,
    dest_consonants, dest_syllable_sep and implicit_vowel_before_NL
    (presumably set up by make_dictionary for the current dest -
    TODO confirm)."""
    assert type(pronunc) in [bytes,unicode,list], type(pronunc)
    if source==dest: return pronunc # essential for --try experimentation with codes not yet supported by lexconvert
    if type(pronunc)==list: return [convert(p,source,dest) for p in pronunc]
    # Source-format preprocessing: optional function, then regexp substitutions
    func = checkSetting(source,'cvtOut_func')
    if func: pronunc=func(pronunc)
    for s,r in checkSetting(source,'cvtOut_regexps'):
        pronunc=re.sub(maybe_bytes(s,pronunc),maybe_bytes(r,pronunc),pronunc)
    ret = [] ; toAddAfter = None # ret: output phonemes so far; toAddAfter: a stress mark waiting to be placed after the next vowel
    dictionary = make_dictionary(source,dest)
    maxLen=max(len(l) for l in dictionary.keys()) # longest source phoneme: the first prefix length to try
    debugInfo="" # appended to warnings to say which phoneme was matched last
    separator = checkSetting(dest,'phoneme_separator',' ')
    safe_to_drop = checkSetting(source,"safe_to_drop_characters")
    while pronunc:
        # Longest match first; lettersToTry==0 means nothing matched
        for lettersToTry in range(maxLen,-1,-1):
            if not lettersToTry:
                if safe_to_drop==True: pass # any character may be dropped silently
                # NOTE: 'and' binds tighter than 'or', so when safe_to_drop is
                # false/empty we warn every time; warnedAlready is consulted
                # only when safe_to_drop is a non-empty set of characters
                elif (not safe_to_drop) or not pronunc[:1] in maybe_bytes(safe_to_drop,pronunc) and not (pronunc[:1],debugInfo) in warnedAlready:
                    warnedAlready.add((pronunc[:1],debugInfo))
                    sys.stderr.write("Warning: ignoring "+source+" character "+repr(pronunc[:1])+debugInfo+" (unsupported in "+dest+")\n")
                pronunc=pronunc[1:] # ignore
            elif pronunc[:lettersToTry] in dictionary:
                debugInfo=" after "+as_printable(pronunc[:lettersToTry])
                toAdd=dictionary[pronunc[:lettersToTry]]
                assert type(toAdd) in [bytes,unicode], type(toAdd)
                # True if toAdd is non-empty and is dest's primary or secondary stress mark
                isStressMark=(toAdd and toAdd in [maybe_bytes(lexFormats[dest].get(primary_stress,''),toAdd),maybe_bytes(lexFormats[dest].get(secondary_stress,''),toAdd)])
                if toAdd==maybe_bytes(lexFormats[dest].get(syllable_separator,''),toAdd): pass # syllable separator (or an empty mapping when dest has none): no special handling
                elif isStressMark and not checkSetting(dest,"stress_comes_before_vowel"):
                    if checkSetting(source,"stress_comes_before_vowel"): toAdd, toAddAfter = maybe_bytes("",toAdd),toAdd # move stress marks from before vowel to after
                    else: # stress is already after, but:
                        # With Cepstral synth (and kana-approx), stress mark should be placed EXACTLY after the vowel and not any later. Might as well do this for others also.
                        r=len(ret)-1
                        while ret[r] in dest_consonants or ret[r].endswith(maybe_bytes("*added",ret[r])): r -= 1 # (if that raises IndexError then the input had a stress mark before any vowel) ("*added" condition is there so that implicit vowels don't get the stress)
                        ret.insert(r+1,toAdd) ; toAdd=maybe_bytes("",toAdd)
                elif isStressMark and not checkSetting(source,"stress_comes_before_vowel"): # it's a stress mark that should be moved from after the vowel to before it
                    # Skip back over trailing consonants and implicit vowels, then one more item (the vowel itself)
                    i=len(ret)
                    while i and (ret[i-1] in dest_consonants or ret[i-1].endswith(maybe_bytes("*added",ret[i-1]))): i -= 1
                    if i: i-=1
                    ret.insert(i,toAdd)
                    if dest_syllable_sep: ret.append(maybe_bytes(dest_syllable_sep,toAdd)) # (TODO: this assumes stress marks are at end of syllable rather than immediately after vowel; correct for Festival; check others; probably a harmless assumption though; mac-uk is better with syllable separators although espeak basically ignores them)
                    toAdd = maybe_bytes("",toAdd)
                # attempt to sort out the festival dictionary's (and other's) implicit_vowel_before_NL
                # (an n or l straight after a consonant gets a vowel marked "*added" in front of it ...)
                elif implicit_vowel_before_NL and ret and ret[-1] and toAdd in [maybe_bytes('n',toAdd),maybe_bytes('l',toAdd)] and ret[-1] in dest_consonants: ret.append(maybe_bytes(implicit_vowel_before_NL,toAdd)+maybe_bytes('*added',toAdd))
                # (... and that added vowel is deleted again if a non-consonant, non-separator follows the n or l)
                elif len(ret)>2 and ret[-2].endswith(maybe_bytes('*added',ret[-2])) and toAdd and not toAdd in dest_consonants and not toAdd==dest_syllable_sep: del ret[-2]
                if toAdd:
                    # Add it, but if toAdd is multiple phonemes, try to put toAddAfter after the FIRST phoneme
                    if separator: toAddList=toAdd.split(separator)
                    else: toAddList = [toAdd] # TODO: won't work for formats that don't have a phoneme separator (doesn't really matter for eSpeak though)
                    ret.append(toAddList[0])
                    if toAddAfter and not toAddList[0] in dest_consonants:
                        ret.append(toAddAfter)
                        toAddAfter=None
                    ret += toAddList[1:]
                pronunc=pronunc[lettersToTry:] # consume the matched prefix
                break
    if toAddAfter: ret.append(toAddAfter) # a pending stress mark never found its vowel: put it at the end
    if ret and ret[-1]==dest_syllable_sep: del ret[-1] # spurious syllable separator at end
    if not ret: ret = ""
    else: ret=maybe_bytes(separator,ret[0]).join(ret).replace(maybe_bytes('*added',ret[0]),maybe_bytes('',ret[0])) # join, and strip the internal "*added" markers
    # Dest-format postprocessing: regexp substitutions, then optional function
    for s,r in checkSetting(dest,'cleanup_regexps'):
        ret=re.sub(maybe_bytes(s,ret),maybe_bytes(r,ret),ret)
    func = checkSetting(dest,'cleanup_func')
    if func: return func(ret)
    else: return ret
2831
def unicode_preprocess(pronunc):
    """Special-case cvtOut_func for unicode-ipa etc: tries to catch \\uNNNN etc.

    If pronunc contains a backslash-u sequence and no double quote, it
    is evaluated as the body of a u"..." literal so that the escapes
    are turned into the characters they name.  Otherwise we try to
    decode pronunc as UTF-8.  If either attempt fails, pronunc is
    returned unchanged."""
    if maybe_bytes("\\u",pronunc) in pronunc and not maybe_bytes('"',pronunc) in pronunc: # maybe \uNNNN copied from Gecko on X11, can just evaluate it to get the unicode
        # (NB make sure to quote the \'s if passing in on the command line)
        # SECURITY NOTE: this eval()s text taken from the input.  The
        # "no double quote" test above is what stops the text from
        # closing the string literal, so do not relax that test
        # without also replacing the eval.
        try: pronunc=eval('u"'+pronunc+'"')
        except Exception: pass # not a valid literal body (or bytes on Python 3): leave as-is, but let KeyboardInterrupt/SystemExit through
    else: # see if it makes sense as utf-8
        try: pronunc = pronunc.decode('utf-8')
        except (AttributeError,UnicodeError): pass # already text (no .decode on Python 3 str), or not valid UTF-8
    return pronunc
2842
def ascii_braille_to_unicode(a):
    "Special-case cleanup_func for braille-ipa (set by braille-ipa if BRAILLE_UNICODE is set). Maps each Braille ASCII character of 'a' to the corresponding Unicode dot pattern (U+2800 onwards); anything else is passed through unchanged."
    asciiCells = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
    table = {}
    for cell,codepoint in zip(asciiCells,range(0x2800,0x2840)):
        table[cell] = unichr(codepoint)
    return u''.join([table.get(ch,ch) for ch in a])
def unicode_to_ascii_braille(u):
    "Reverse of ascii_braille_to_unicode: maps Unicode dot patterns (U+2800 onwards) in 'u' to Braille ASCII, passing other characters through. If the result starts with ,7 and ends with 7' then those two-character wrappers are removed."
    asciiCells = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
    table = {}
    for codepoint,cell in zip(range(0x2800,0x2840),asciiCells):
        table[unichr(codepoint)] = cell
    converted = ''.join([table.get(ch,ch) for ch in as_unicode(u)])
    if converted.startswith(",7") and converted.endswith("7'"):
        return converted[2:-2]
    return converted
2852
def hiragana_to_katakana(u):
    "Special-case cleanup_func for kana-approx. Returns unicode string 'u' unchanged unless the KANA_TYPE environment variable begins with 'k' (in either case), in which case every character in the range U+3041 to U+3096 (hiragana) is shifted up by 0x60 (to katakana)."
    assert type(u)==unicode
    wantKatakana = os.environ.get("KANA_TYPE","").lower().startswith("k")
    if not wantKatakana: return u
    shifted = []
    for ch in u:
        if 0x3041 <= ord(ch) <= 0x3096: shifted.append(unichr(ord(ch)+0x60))
        else: shifted.append(ch)
    return u"".join(shifted)
2862
def espeak_probably_right_already(existing_pronunc,new_pronunc):
    """Used by convert_system_festival_dictionary_to_espeak to compare a "new" pronunciation with eSpeak's existing pronunciation. As the transcription from OALD to eSpeak is only approximate, the new pronunciation may differ from the existing one even when the existing one is correct; so we also treat the two as matching if they become equal after a fixed series of simplifying substitutions. Returns True on a match, otherwise None."""
    if existing_pronunc==new_pronunc: return True
    # Applied strictly in this order (later rules see the output of earlier ones)
    substitutions = [
        (";",""), ("%",""),
        ("a2","@"),
        ("3","@"),
        ("L","l"),
        ("I2","i:"),
        ("I","i:"), ("i@","i:@"),
        (",",""),
        ("s","z"),
        ("aa","A:"),
        ("A@","A:"),
        ("O@","O:"),
        ("o@","O:"),
        ("r-","r"),
        # TODO: rewrite @ to 3 whenever not followed by a vowel?
    ]
    def simplify(pronunc):
        for old,new in substitutions:
            pronunc = pronunc.replace(maybe_bytes(old,pronunc),maybe_bytes(new,pronunc))
        return pronunc
    simplifiedOld = as_printable(simplify(existing_pronunc))
    simplifiedNew = as_printable(simplify(new_pronunc))
    if simplifiedOld==simplifiedNew: return True # almost the same, and festival @/a2 etc seems to be a bit ambiguous so leave it alone
2882
def parse_festival_dict(festival_location):
    """For OALD; yields word,part-of-speech,pronunciation.

    festival_location is the path of the Festival dictionary file.
    Each line has any trailing "((pos ..." section cut off, its quotes
    and brackets removed, and is then split on whitespace into a word,
    a part-of-speech tag and the rest (the pronunciation).  Lines with
    fewer than three fields, or whose second field is not one of the
    recognised part-of-speech tags, are skipped.  The word is yielded
    in lower case.  The file is closed when the generator finishes."""
    recognised_pos = ['n','v','a','cc','dt','in','j','k','nil','prp','uh']
    with open(festival_location) as dictFile:
        for line in dictFile:
            line=line.strip()
            if "((pos" in line: line=line[:line.index("((pos")]
            if line.startswith('( "'): line=line[3:]
            line=line.replace('"','').replace('(','').replace(')','')
            try:
                word, pos, pronunc = line.split(None,2)
            except ValueError: continue # malformed line
            if pos not in recognised_pos: continue # two or more words
            yield (word.lower(), pos, pronunc)
2896
class Message(Exception):
    "Exception whose text is a message for the user (e.g. raised by read_user_lexicon when a lexicon cannot be read)"
def convert_system_festival_dictionary_to_espeak(festival_location,check_existing_pronunciation,add_user_dictionary_also):
    """See mainopt_festival_dictionary_to_espeak.

    Works on files in the current directory: moves any existing
    en_extra to en_extra~, reads en_list, writes a new en_extra, and
    runs "espeak --compile=en".  Uses /tmp/.pronunc as a scratch file
    for eSpeak's output.

    festival_location: path of the Festival (OALD) dictionary, read
      via parse_festival_dict.
    check_existing_pronunciation: if true, ask eSpeak how it already
      pronounces each candidate word, and omit words it seems to get
      right (see espeak_probably_right_already).
    add_user_dictionary_also: if true, also append the user's Festival
      lexicon via convert_user_lexicon.

    Returns the list of words that were NOT written because eSpeak's
    existing pronunciation was judged to be right."""
    os.system("mv en_extra en_extra~") # start with blank 'extra' dictionary
    if check_existing_pronunciation: os.system("espeak --compile=en") # so that the pronunciation we're checking against is not influenced by a previous version of en_extra
    outFile=open("en_extra","w")
    print ("Reading dictionary lists")
    wordDic = {} ; ambiguous = {} # wordDic: word -> (pronunc,pos); ambiguous: UTF-8 words we must not output
    el = open("en_list")
    for line in filter(lambda x:x.split() and not re.match(maybe_bytes(r'^[a-z]* *\$',x),x),getBuf(el).read().split(as_utf8('\n'))): ambiguous[line.split()[0]]=ambiguous[line.split()[0]+as_utf8('s')]=True # this stops the code below from overriding anything already in espeak's en_list. If taking out then you need to think carefully about words like "a", "the" etc.
    for word,pos,pronunc in parse_festival_dict(festival_location):
        pronunc=pronunc.replace("i@ 0 @ 0","ii ou 2 ").replace("i@ 0 u 0","ii ou ") # (hack for OALD's "radio"/"video"/"stereo"/"embryo" etc)
        pronunc=pronunc.replace("0","") # 0's not necessary, and OALD sometimes puts them in wrong places, confusing the converter
        if word in ['mosquitoes']: continue # OALD bug (TODO: any others?)
        # A word seen twice with different pronunciation or part of speech is dropped altogether
        if word in wordDic and not wordDic[word]==(pronunc,pos):
            ambiguous[as_utf8(word)] = True
            del wordDic[word] # better not go there
        if not as_utf8(word) in ambiguous:
            wordDic[word] = (pronunc, pos)
    toDel = [] # words to remove from wordDic once the loop below has finished
    if check_existing_pronunciation:
        print ("Checking existing pronunciation")
        proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w") # we write words to eSpeak's stdin; its phoneme output goes to the scratch file
        wList = [] # the words sent to eSpeak, in order, so they can be matched with its output lines
    progressCount=0 ; oldPercent=-1
    itemList = list(wordDic.items())
    # Make sure it's NOT sorted, to ensure eSpeak doesn't
    # cache pronunciation of previous word when add suffix
    # (which can subtly change eSpeak's pronunciation in
    # some versions of eSpeak, leading to
    # Python 2/3 differences as Python 3 sorts by default) :
    # (we sort first to get the same starting point every time,
    # then interleave the two halves, each taken from its end)
    itemList.sort()
    i0,i1 = itemList[:int(len(itemList)/2)],itemList[int(len(itemList)/2):]
    itemList = []
    while i0 or i1:
        if i0: itemList.append(i0.pop())
        if i1: itemList.append(i1.pop())
    for word,(pronunc,pos) in itemList:
        if check_existing_pronunciation:
            # progress indicator: rewrite the percentage only when it changes
            percent = int(progressCount*100/len(wordDic))
            if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
            oldPercent=percent
            progressCount += 1
        if not re.match("^[A-Za-z]*$",word): # (some versions of eSpeak also OK with "-", but not all)
            # contains special characters - better not go there
            toDel.append(word)
        elif word.startswith("plaque") or word in "friday saturday sunday tuesday thursday yesterday".split():
            # hack to accept eSpeak's pl'ak instead of pl'A:k - order was reversed in the March 2009 draft
            toDel.append(word)
        elif word[-1]=="s" and word[:-1] in wordDic:
            # unnecessary plural (espeak will pick up on them anyway)
            toDel.append(word)
        elif word.startswith("year") or "quarter" in word: toDel.append(word) # don't like festival's pronunciation of those (TODO: also 'memorial' why start with [m'I])
        elif check_existing_pronunciation:
            getBuf(proc).write(as_utf8(word)+as_utf8("\n"))
            proc.flush() # so the progress indicator works
            wList.append(word)
    if check_existing_pronunciation:
        proc.close() ; print("")
        oldPronDic = {} # word -> eSpeak's existing pronunciation (bytes, spaces removed)
        tp = open("/tmp/.pronunc")
        for k,v in zip(wList,getBuf(tp).read().split(as_utf8("\n"))): oldPronDic[k]=v.strip().replace(as_utf8(" "),as_utf8(""))
    for w in toDel: del wordDic[w]
    print ("Doing the conversion")
    lines_output = 0
    total_lines = 0
    not_output_because_ok = []
    items = list(wordDic.items()) ; items.sort() # necessary because of the hacks below which check for the presence of truncated versions of the word (want to have decided whether or not to output those truncated versions before reaching the hacks)
    for word,(pronunc,pos) in items:
        total_lines += 1
        new_e_pronunc = convert(pronunc,"festival","espeak")
        if new_e_pronunc.count("'")==2 and not '-' in word: new_e_pronunc=new_e_pronunc.replace("'",",",1) # if 2 primary accents then make the first one a secondary (except on hyphenated words)
        # TODO if not en-rp? - if (word.endswith("y") or word.endswith("ie")) and new_e_pronunc.endswith("i:"): new_e_pronunc=new_e_pronunc[:-2]+"I"
        unrelated_word = None
        if check_existing_pronunciation: espeakPronunc = oldPronDic.get(word,"")
        else: espeakPronunc = ""
        if word[-1]=='e' and word[:-1] in wordDic: unrelated_word, espeakPronunc = word[:-1],"" # hack: if word ends with 'e' and dropping the 'e' leaves a valid word that's also in the dictionary, we DON'T want to drop this word on the grounds that espeak already gets it right, because if we do then adding 's' to this word may cause espeak to add 's' to the OTHER word ('-es' rule).
        if espeak_probably_right_already(espeakPronunc,new_e_pronunc):
            not_output_because_ok.append(word)
            continue
        if not unrelated_word: lines_output += 1 # entries written only because of the 'e' hack are not counted as corrections
        getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+as_utf8(new_e_pronunc)+as_utf8(" // from Festival's (")+as_utf8(pronunc)+as_utf8(")"))
        if espeakPronunc: getBuf(outFile).write(as_utf8(", not [[")+as_utf8(espeakPronunc)+as_utf8("]]"))
        elif unrelated_word: getBuf(outFile).write(as_utf8(" (here to stop espeak's affix rules getting confused by Festival's \"")+as_utf8(unrelated_word)+as_utf8("\")"))
        getBuf(outFile).write(as_utf8("\n"))
    print ("Corrected(?) %d entries out of %d" % (lines_output,total_lines))
    if add_user_dictionary_also: convert_user_lexicon("festival","espeak",outFile)
    outFile.close()
    os.system("espeak --compile=en")
    if not_output_because_ok:
        # Second pass: now that en_extra is compiled in, re-ask eSpeak about the
        # words we left out, and pin any whose pronunciation has changed
        # NOTE(review): this branch reads oldPronDic, which is assigned only when check_existing_pronunciation is true
        print ("Checking for unwanted side-effects of those corrections") # e.g. terrible as Terr + ible, inducing as in+Duce+ing
        proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
        progressCount = 0
        for w in not_output_because_ok:
            getBuf(proc).write(as_utf8(w)+as_utf8("\n")) ; proc.flush()
            percent = int(progressCount*100/len(not_output_because_ok))
            if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
            oldPercent = percent
            progressCount += 1
        proc.close()
        outFile=open("en_extra","a") # append to it
        tp = open("/tmp/.pronunc")
        for word,pronunc in zip(not_output_because_ok,getBuf(tp).read().split(as_utf8("\n"))):
            pronunc = pronunc.strip().replace(as_utf8(" "),as_utf8(""))
            if not pronunc==oldPronDic[word] and not espeak_probably_right_already(oldPronDic[word],pronunc):
                getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+oldPronDic[word]+as_utf8(" // (undo affix-side-effect from previous words that gave \"")+pronunc+as_utf8("\")\n"))
        outFile.close()
        os.system("espeak --compile=en")
    return not_output_because_ok
3006
def read_user_lexicon(fromFormat):
    "Looks up fromFormat's lex_read_function and returns the result of calling it. If the format has a lex_filename then that file is opened first and passed in; otherwise the function is passed None. Raises Message if there is no lex_read_function or the file cannot be read."
    reader = checkSetting(fromFormat,"lex_read_function")
    if not reader:
        raise Message("Reading from '%s' lexicon file not yet implemented (no lex_read_function); try using --phones or --phones2phones options instead" % (fromFormat,))
    try:
        lexFilename = getSetting(fromFormat,"lex_filename")
        if not lexFilename==None:
            lexfile = open(lexFilename)
            quiet = os.environ.get("LEXCONVERT_OMIT_READING_FROM","") # TODO: document LEXCONVERT_OMIT_READING_FROM (might be useful for the --mac-uk option)
            if not quiet: print ("Reading from "+lexFilename)
        else: lexfile = None # e.g. the example lexicon
    except KeyError:
        # lex_read_function without lex_filename is allowed, if the read function can take null param and fetch the lexicon itself
        lexfile = None
    except IOError:
        raise Message(fromFormat+"'s lexicon is expected to be in a file called "+replHome(lexFilename)+" which could not be read - please fix and try again")
    return reader(lexfile)
3020
def replHome(fname):
    "Returns fname in a form suitable for printing: if the HOME environment variable is set and fname lies under that directory, the HOME part is abbreviated to ~"
    home = os.environ.get('HOME','')
    if not home or not fname.startswith(home+os.sep):
        return fname
    return "~"+fname[len(home):]
3027
def get_macuk_lexicon(fromFormat):
    "Reads the user lexicon of fromFormat and returns a list of (word, pronunciation converted to mac-uk) pairs, suitable for MacBritish_System_Lexicon's readWithLex"
    entries = []
    for word, pronunc in read_user_lexicon(fromFormat):
        entries.append((word,convert(pronunc,fromFormat,"mac-uk")))
    return entries
3031
def as_utf8(s):
    "Returns s encoded as UTF-8 if its type is exactly 'unicode'; anything else is returned as it is"
    return s.encode('utf-8') if type(s)==unicode else s
def as_unicode(s):
    "Returns s as it is if its type is exactly 'unicode'; anything else is decoded from UTF-8"
    return s if type(s)==unicode else s.decode('utf-8')
def maybe_bytes(s,i):
    "Python 2/3 compatibility: returns s in a form that can be combined with i, i.e. s itself if i is of type 'unicode', otherwise s passed through as_utf8"
    return s if type(i)==unicode else as_utf8(s)
def as_printable(s):
    "Returns s as the interpreter's native string type: the UTF-8 form from as_utf8 on Python 2, and that form decoded back from UTF-8 on Python 3 and above"
    encoded = as_utf8(s)
    if sys.version_info[0] >= 3: return encoded.decode('utf-8')
    return encoded
3045
def convert_user_lexicon(fromFormat,toFormat,outFile):
    "See mainopt_convert. Reads the user lexicon of fromFormat and writes it to outFile in toFormat: the lex_header, then one lex_entry_format line per word (with the word's case changed if lex_word_case says so), then the lex_footer. A header or footer that is not a string is called with outFile instead of being written."
    def emit(part):
        # part is either literal text to write, or a function that does its own writing
        if type(part) in [bytes,unicode]: getBuf(outFile).write(as_utf8(part))
        else: part(outFile)
    lex = read_user_lexicon(fromFormat)
    emit(checkSetting(toFormat,"lex_header"))
    entryFormat = as_utf8(getSetting(toFormat,"lex_entry_format"))
    wordCase = checkSetting(toFormat,"lex_word_case")
    for word, pronunc in lex:
        converted = as_utf8(convert(pronunc,fromFormat,toFormat))
        if wordCase=="upper": word=word.upper()
        elif wordCase=="lower": word=word.lower()
        entry = entryFormat % (as_utf8(word),converted) # will work in Python 3.6, but not in Python 3.4 (e.g. on jessie) which cannot do % on byte-strings
        getBuf(outFile).write(entry)
    emit(checkSetting(toFormat,"lex_footer"))
3062
def bbcMicro_partPhonemeCount(pronunc):
    """Returns the number of 'part phonemes' (at least that's what I'm calling them) for the BBC Micro phonemes in pronunc. The *SPEAK command cannot take more than 117 part-phonemes at a time before saying "Line too long", and in some cases it takes less than that (I'm not sure why); 115 is a safer limit."""
    # phonemes and space count, but pitch numbers do not count
    # (order matters: the first entry that is a prefix of the remaining text wins)
    phonemes = ' ,AA,AE,AH,AI,AO,AW,AY,B,CH,CT,DH,DUX,D,EE,EH,ER,F,G,/H,IH,IX,IY,J,K,L,M,NX,N,OW,OL,OY,O,P,R,SH,S,TH,T,UH,/UL,/U,UW,UX,V,W,Y,ZH,Z'.split(',')
    partsFor = {
        # *SPEAK can take 117 of most single-letter phonemes, or 116 (limited by the 232+6-character input limit) of most 2-letter phonemes
        'AW':2,'IY':2,'OW':2,'OL':2,'UW':2,'/UL':2, # *SPEAK can take 58 of these
        'DUX':3,'AY':3,'CH':3,'J':3,'OY':3, # *SPEAK can take 39 of these
        'CT':4, # *SPEAK can take 29 of these
        }
    total = 0 ; pronunc0 = pronunc
    while pronunc:
        for p in phonemes:
            if pronunc.startswith(as_utf8(p)):
                total += partsFor.get(p,1)
                pronunc = pronunc[len(p):]
                break
        else:
            # no phoneme matched: only a pitch number is allowed here, and it is skipped without being counted
            assert as_printable(pronunc[:1]) in '12345678',"Unrecognised BBC Micro phoneme at "+str(pronunc)+" in "+str(pronunc0)
            pronunc = pronunc[1:]
    return total
3081
def markup_inline_word(format,pronunc):
    "Returns pronunc (as UTF-8) with any markup needed for putting it in a text, according to the inline_format setting of 'format': either a %s-style template, or a function to call on the pronunciation"
    pronunc = as_utf8(pronunc) # UTF-8 output - ok for pasting into Firefox etc *IF* the terminal/X11 understands utf-8 (otherwise redirect to a file, point the browser at it, and set encoding to utf-8, or try --convert'ing which will o/p HTML)
    format = checkSetting(format,"inline_format","%s")
    if not type(format) in [bytes,unicode]:
        return format(pronunc)
    if type(format)==unicode: format=format.encode('utf-8') # see above
    return format % pronunc
def markup_doubleTalk_word(pronunc):
    """Special-case function set as inline_format in doubletalk (checks environment variables for command code).

    Returns pronunc wrapped as <cmd>D<pronunc><cmd>T, where <cmd> is
    the single byte whose numeric value is given by the environment
    variable DTALK_COMMAND_CODE, or '*' if that variable is unset or
    empty.  Raises ValueError if DTALK_COMMAND_CODE is not a number in
    the range 0 to 255."""
    cmd = os.environ.get('DTALK_COMMAND_CODE','')
    # Build the command byte as a byte-string on both Python 2 and 3:
    # chr() would give a (Unicode) str on Python 3, which cannot be
    # %-formatted into the byte-string template below
    if cmd: cmd=bytes(bytearray([int(cmd)]))
    else: cmd = as_utf8('*')
    return as_utf8("%sD%s%sT") % (cmd,pronunc,cmd)
def markup_bbcMicro_word(pronunc):
    "Special-case function set as inline_format in bbcmicro. Returns pronunc, prefixed with a new *SPEAK command (on a new line unless it's the first) whenever adding the word to the current command would exceed the part-phoneme or line-length limit. Keeps running totals in the globals bbc_partsSoFar and bbc_charsSoFar. See also write_bbcmicro_phones."
    global bbc_partsSoFar,bbc_charsSoFar
    thisPartCount = bbcMicro_partPhonemeCount(pronunc)
    # re the 115 limit see bbcMicro_partPhonemeCount
    partsFull = (not bbc_partsSoFar) or bbc_partsSoFar+thisPartCount > 115
    # 238 is max len of BBC BASIC prompt (both the immediate prompt and the one with line number supplied by AUTO, in both BASIC II and BASIC IV)
    charsFull = (not bbc_charsSoFar) or bbc_charsSoFar+len(pronunc) > 238
    if not (partsFull or charsFull):
        # the word fits on the current command
        bbc_charsSoFar += len(pronunc)+1
        bbc_partsSoFar += thisPartCount+1
        return pronunc
    # otherwise start a new command (ending the previous line if there was one)
    if bbc_charsSoFar: lineBreak = "\n"
    else: lineBreak = ""
    cmd="*SPEAK" # (could add a space if want to make it more readable, at the expense of an extra keystroke in the paste buffer; by the way, when not using the ROM version you must use *SPEAK not OS.("SPEAK"), at least on a Model B; seems OSCLI doesn't go through quite the same vectors as star)
    bbc_charsSoFar = len(cmd)+len(pronunc)+1 # +1 for the space that'll be after this word if we don't start a new line
    bbc_partsSoFar = thisPartCount+1 # ditto
    return as_utf8(lineBreak+cmd)+pronunc
bbc_partsSoFar=bbc_charsSoFar=0
3112
def sylcount(example_format_festival):
    """Tries to count the number of syllables in a Festival string (see mainopt_syllables). We treat @ as counting the same as the previous syllable (e.g. "fire", "power"), but this can vary in different songs, so the result will likely need a bit of proofreading."""
    syllables = 0
    inVowel = pending = sawAt = 0 # pending: a syllable that a following consonant may confirm
    phones = example_format_festival.split() # no brackets, emphasis by vowels, but spaces between each syllable
    for idx,phone in enumerate(phones):
        first = phone[:1]
        if first not in "aeiou@12":
            # a consonant ends the vowel group
            if first not in "drz": syllables += pending # not 'r/z' e.g. "ours", "fired" usually 1 syllable in songs, "desirable" usually 4 not 5
            # TODO steward? y u@ 1 d but usually 2 syllables
            inVowel = pending = sawAt = 0
            continue
        if first in "aeiou": inVowel = 0 # unconditionally start new syllable
        if not inVowel: syllables += 1
        elif first=="@" and not sawAt: pending = 1 # (e.g. "loyal", but NOT '1', e.g. "world")
        if "@" in phone: sawAt = 1 # for words like "cheerful" ("i@ 1 @" counts as one)
        inVowel = 1
        if first=="@" and idx>=3 and phones[idx-2:idx]==["ai","1"] and phones[idx-3] in ["s","h"]:
            # special rule for higher, Messiah, etc - like "fire" but usually 2 syllables
            pending = 0 ; syllables += 1
    return syllables
def hyphenate(word,numSyls):
    """See mainopt_syllables.

    Splits word into numSyls roughly equal parts joined by "- ", then
    nudges each split point by a letter or two (e.g. so that a doubled
    consonant is divided between the two parts, and a hyphen stays at
    the end of a part).  Leading and trailing characters that are not
    letters a-z are kept outside the split.  word may be text or
    UTF-8 bytes; the result has the same type.  word is returned
    unchanged if numSyls is less than 1 or more than the number of
    characters available."""
    orig = word
    # Work on text if we were given UTF-8 bytes (and re-encode at the end)
    try: word,isu8 = word.decode('utf-8'),True
    except (AttributeError,UnicodeError): isu8 = False # already text (no .decode on Python 3 str), or not valid UTF-8
    pre=[] ; post=[] # non-letters stripped from the start and end
    while word and not 'a'<=word[:1].lower()<='z':
        pre.append(word[:1]) ; word=word[1:]
    while word and not 'a'<=word[-1].lower()<='z':
        post.insert(0,word[-1:]) ; word=word[:-1]
    if numSyls<1 or numSyls>len(word): return orig # probably numbers or something (and numSyls<1 would divide by zero below)
    l = (len(word)+numSyls//2)//numSyls ; syls = [] # l = letters per part, rounded to nearest
    for i in range(numSyls):
        if i==numSyls-1: syls.append(word[i*l:]) # last part takes whatever is left
        else: syls.append(word[i*l:(i+1)*l])
        if len(syls)>1:
            # adjust the boundary between the part just added and the one before it
            if syls[-1].startswith('-') or (len(syls[-1])>2 and syls[-1][:1]==syls[-1][1:2] and not syls[-1][:1].lower() in "aeiou"):
                # repeated consonant at start - put one on previous
                # (or hyphen at start - move it to the previous)
                syls[-2] += syls[-1][:1]
                syls[-1] = syls[-1][1:]
            elif len(syls[-1])>2 and syls[-1][1]=='-':
                # better move this splitpoint after that hyphen (TODO: move more than one character?)
                syls[-2] += syls[-1][:2]
                syls[-1] = syls[-1][2:]
            elif ((len(syls[-2])>2 and syls[-2][-1]==syls[-2][-2] and not syls[-2][-1].lower() in "aeiou") \
                  or (syls[-1] and syls[-1][:1].lower() in "aeiouy" and len(syls[-2])>2)) \
                  and list(filter(lambda x:x.lower() in "aeiou",list(syls[-2][:-1]))):
                # repeated consonant at end - put one on next
                # or vowel on right: move a letter over (sometimes the right thing to do...)
                # (unless doing so leaves no vowels)
                syls[-1] = syls[-2][-1]+syls[-1]
                syls[-2] = syls[-2][:-1]
    word = ''.join(pre)+"- ".join(syls)+''.join(post)
    if isu8: word=word.encode('utf-8')
    return word
3167
def macSayCommand():
    """Returns the command to use for speaking on the Mac: the value of the environment variable SAY_COMMAND if that is set to something non-empty, otherwise "say".
    E.g. SAY_COMMAND="say -o file.aiff" (TODO: document this in the help text?)
    In Gradint you can set (e.g. if you have a ~/.festivalrc) extra_speech=[("en","python lexconvert.py --mac-uk festival")] ; extra_speech_tofile=[("en",'echo %s | SAY_COMMAND="say -o /tmp/said.aiff" python lexconvert.py --mac-uk festival && sox /tmp/said.aiff /tmp/said.wav',"/tmp/said.wav")]"""
    return os.environ.get("SAY_COMMAND","") or "say"
3175
def stdin_is_terminal():
    "Returns a true value if it seems the standard input is connected to a terminal (rather than piped from a file etc). If sys.stdin has no isatty method, we assume it is a terminal."
    if hasattr(sys.stdin,"isatty"):
        return sys.stdin.isatty()
    return True
3179
def getInputText(i,prompt,as_iterable=False):
    """Gets text either from the command line (sys.argv[i] onwards, joined with spaces) or, if that is empty, from standard input. A prompt is written to standard error if we have to read standard input and it is connected to a tty instead of a pipe or file. If as_iterable, an iterable over the lines is returned instead of all the text at once. If as_iterable=='maybe', an iterable is returned but, when not reading from a tty, everything is read into one item."""
    txt = ' '.join(sys.argv[i:])
    if txt:
        # text was given on the command line
        if as_iterable=='maybe': return [txt]
        if as_iterable: return txt.split('\n')
        return txt
    if stdin_is_terminal():
        sys.stderr.write("Enter "+prompt+" (EOF when done)\n")
    elif as_iterable=='maybe':
        return [getBuf(sys.stdin).read()]
    if as_iterable: return my_xreadlines()
    try: return getBuf(sys.stdin).read()
    except KeyboardInterrupt: raise SystemExit
3193
try:
    raw_input # Python 2
except NameError:
    raw_input = input # Python 3
def my_xreadlines():
    "Generator yielding lines of standard input one at a time (without their line endings) until end of file. On some platforms this might be a bit more responsive than sys.stdin.xreadlines. A keyboard interrupt is turned into SystemExit."
    try:
        while True:
            yield raw_input()
    except EOFError:
        return
    except KeyboardInterrupt:
        raise SystemExit
3202
def output_clauses(format,clauses):
    "Writes out clauses and words in format 'format' (clauses is a list of lists of words in the phones of 'format'). Normally each word goes through markup_inline_word, words are joined with the format's word separator and clauses with its clause_separator. If however the clause_separator setting is not a string, it is called with the clauses and does all the output itself. Refuses to write a binary format to a terminal."
    if checkSetting(format,"output_is_binary") and hasattr(sys.stdout,"isatty") and sys.stdout.isatty():
        print ("This is a binary format - not writing to terminal.\nPlease direct output to a file or pipe.")
        return
    clause_sep = checkSetting(format,"clause_separator","\n")
    if not type(clause_sep) in [bytes,unicode]:
        clause_sep(clauses) # special case: the setting is a function
        return
    out = getBuf(sys.stdout)
    joiner = as_utf8(clause_sep)
    rendered = []
    for clause in clauses:
        wordSep = as_utf8(wordSeparator(format))
        rendered.append(wordSep.join([markup_inline_word(format,word) for word in clause]))
    out.write(joiner.join(rendered))
def write_bbcmicro_phones(clauses):
    """Special-case function set as clause_separator in bbcmicro format. Must be a special case because it needs to track any extra keystrokes to avoid "Line too long". And while we're at it, we might as well start a new *SPEAK command with each clause, using the natural brief delay between commands; this should minimise the occurrence of additional delays in arbitrary places. Also calls print_bbc_warnings"""
    global bbc_charsSoFar
    keystrokes = 0 ; lineCount = 0
    for clause in clauses:
        bbc_charsSoFar = 0 # makes markup_bbcMicro_word begin a fresh *SPEAK for this clause
        marked = [markup_inline_word("bbcmicro",word) for word in clause]
        text = as_utf8(" ").join(marked)
        getBuf(sys.stdout).write(text.replace(as_utf8(" \n"),as_utf8("\n")))
        keystrokes += len(text)+1
        lineCount += 1
    print_bbc_warnings(keystrokes,lineCount)
3220 def print_bbc_warnings(keyCount,lineCount):
3221 "Print any relevant size warnings regarding sending 'keyCount' keys in 'lineCount' lines to the BBC Micro"
3222 sys.stdout.flush() # try to keep in sync if someone's doing 2>&1 | less
3223 limits_exceeded = [] ; severe=0
3224 if keyCount >= 32768:
3225 severe=1 ; limits_exceeded.append("BeebEm 32K keystroke limit") # At least in version 3, the clipboard is defined in beebwin.h as a char of size 32768 and its bounds are not checked. Additionally, if you script a second paste before the first has finished (or if you try to use BeebEm's Copy command) then the first paste will be interrupted. So if you really want to make BeebEm read more then I suggest setting a printer destination file, putting a VDU 2,10,3 after each batch of commands, and waiting for that \n to appear in that printer file before sending the next batch, or perhaps write a set of programs to a disk image and have them CHAIN each other or whatever.
3226 shadow_himem=0x8000 # if using a 'shadow mode' on the Master/B+/Integra-B (modes 128-135, which leave all main RAM free)
3227 mode7_himem=0x7c00 # (40x25 characters = 1000 bytes, by default starting at 7c00 with 24 bytes spare at the top, but the scrolling system uses the full 1024 bytes and can tell the video controller to start rendering at any one of them; if you get Jeremy Ruston's book and program the VIDC yourself then you could fix it at 7c18 if you really want, or just set HIMEM=&8000 and don't touch the screen, but that doesn't give you very much more room)
3228 default_speech_loc=0x5500
3229 overhead_per_program_line = 4
3230 for page,model in [
3231 (0x1900,"Model B"), # with Acorn DFS (a reasonable assumption although alternate DFS ROMs are different)
3232 (0xE00,"Master")]: # (the Master has 8k of special paged-in "filing system RAM", so doesn't need 2816 bytes of main RAM for DFS)
3233 top = page+keyCount+lineCount*(overhead_per_program_line-1)+2 # the -1 is because keyCount includes a carriage return at the end of each line
3234 if model=="Master": x=" (use Speech's Sideways RAM version instead, e.g. *SRLOAD SP8000 8000 7 and reset, but sound quality might be worse)" # I don't know why but SP8000 can play higher and more distorted than SPEECH, at least on emulation (and changing the emulation speed doesn't help, because that setting, at least in BeebEm3, just controls extra usleep every frame; it doesn't actually slow down the 6502 *between* frames; anyway timing of sound changes is done by CyclesToSamples stuff in beebsound.cc's SoundTrigger). If on the Master you go into View (*WORD) and then try SP8000, it plays _lower_ than *SPEECH (even if you do *BASIC first) and *SAY can corrupt a View document; ViewSheet (*SHEET) doesn't seem to have this effect; neither does *TERMINAL but *SAY can confuse the terminal.
3235 # Re bank numbers, by default banks 4 to 7 are Sideways RAM (4*16k=64k) and I suppose filling up from 7 makes sense because banks 8-F are ROMs (ANFS,DFS,ViewSheet,Edit,BASIC,ADFS,View,Terminal; OS is a separate 16k so there's scope for 144k of supplied ROM). Banks 0-3 are ROM expansion slots. The "128" in the name "Master 128" comes from 32k main RAM, 64k Sideways RAM, 20k shadow RAM (for screen modes 128-135), 4k OS "private RAM" (paged on top of 8000-8FFF) and 8k filing system RAM (paged on top of C000-DFFF) = 128k. Not sure what happened on the B+.
3236 # By the way BeebEm's beebsound.cc also shows us why SOUND was always out of tune especially in the higher pitches. The 16-bit freqval given to the chip is 125000/freq and must be an integer, so the likely temperament in cents for non-PCM is given by [int(math.log(125000.0/math.ceil(125000/freq)/freq,2**(1.0/1200))) for freq in [440*((2**(1.0/12))**semi) for semi in range(-12*3+2,12*2+6)]] (the actual temperament will depend on the OS's implementation of mapping SOUND pitch values to freqval's, unless you program the chip directly, but this list is indicative and varies over 10% in the top 2 octaves)
3237 # Some other ROMs (e.g. Alan Blundell's "Informant" 1989) seem to result in a crash after the *SPEECH and/or *SPEAK commands complete, at least in some emulator configurations; this may or may not be resolved via timing adjustments or adjustments in the ROM order; not sure exactly what the problem is
3238 else: x=" (Speech program will be overwritten unless relocated)" # (could use Sideways RAM for it instead if you have it fitted, see above)
3239 if top > default_speech_loc: limits_exceeded.append("%s TOP=&%X limit%s" % (model,default_speech_loc,x)) # The Speech program does nothing to stop your program (or its variables etc) from growing large enough to overwrite &5500, nor does it stop the stack pointer (coming down from HIMEM) from overwriting &72FF. For more safety on a Model B you could use RELOCAT to put Speech at &5E00 and be sure to set HIMEM=&5E00 before loading, but then you must avoid commands that change HIMEM, such as MODE (but selecting any non-shadow mode other than 7 will overwrite Speech anyway, although if you set the mode before loading Speech then it'll overwrite screen memory and still work as long as the affected part of the screen is undisturbed). You can't do tricks like ditching the lexicon because RELOCAT won't let you go above 5E00 (unless you fix it, but I haven't looked in detail; if you can fix RELOCAT to go above 5E00 then you can create a lexicon-free Speech by taking the 1st 0x1560 bytes of SPEECH and append two * bytes, relocate to &6600 and set HIMEM, but don't expect *SAY to work, unless you put a really small lexicon into the spare 144 bytes that are left - RELOCAT needs an xx00 address so you can't have those bytes at the bottom). You could even relocate to &6A00 and overwrite (non-shadow) screen memory if you don't mind the screen being filled with gibberish that you'd better not erase! (well if you program the VIDC as mentioned above and you didn't re-add a small lexicon then you could get yourself 3.6 lines of usable Mode 7 display from the spare bytes but it's probably not worth the effort)
3240 if top > mode7_himem:
3241 if model=="Master":
3242 if top > shadow_himem: limits_exceeded.append(model+" 32k HIMEM limit (even for shadow modes)") # TODO: maybe add instructions for using BAS128 on the B+ or Master; this sets PAGE=&10000 and HIMEM=&20000 (i.e. 64k for programs), which uses all 4 SRAM slots so you can't use SP8000 (unless it's on a real ROM); if using Speech in main memory you need to RELOCAT it to leave &3000 upwards for Bas128 code; putting it at &1900 for B+/DFS leaves you only 417 bytes for lexicon (which might not matter if you're using only *SPEECH: just create a shortened lexicon); putting it at &E00 for Master allows space for the default 2204-byte lexicon with 1029 bytes to spare; TODO check if Bas128 uses any workspace between &E00 and &3000 though. Alternatively (if you really want to store such a long program on the BBC) then you'd better split it into several programs that CHAIN each other (as mentioned above).
3243 else: limits_exceeded.append(model+" Mode 7 HIMEM limit (use shadow modes 128-135)")
3244 else: limits_exceeded.append(model+" Mode 7 HIMEM limit") # unless you overwrite the screen (see above) - let's assume the Model B hasn't been fitted with shadow modes (although the Integra-B add-on does give them to the Model B, and leaves PAGE at &1900; B+ has shadow modes but I don't know what's supposed to happen to PAGE on it). 65C02 Tube doesn't help much (it'll try to run Speech on the coprocessor instead of the host, and this results in silence because it can't send its sound back across the Tube; don't know if there's a way to make it run on the host in these circumstances or what the host's memory map is like)
3245 if lineCount > 32768: limits_exceeded.append("BBC BASIC line number limit") # and you wouldn't get this far without filling the memory, even with 128k (4 bytes per line)
3246 elif 10*lineCount > 32767: limits_exceeded.append("AUTO line number limit (try AUTO 0,1)") # (default AUTO increments in steps of 10; you can use AUTO 0,1 to start at 0 and increment in steps of 1. BBC BASIC stores its line info in a compact form which allows a range of 0-32767.)
3247 if severe: warning,after="WARNING: ",""
3248 else: warning,after="Note: ","It should still work if pasted into BeebEm as immediate commands. "
3249 after = ". "+after+"See comments in lexconvert for more details.\n"
3250 if len(limits_exceeded)>1: sys.stderr.write(warning+"this text may be too big for the BBC Micro. The following limits were exceeded: "+", ".join(limits_exceeded)+after)
3251 elif limits_exceeded: sys.stderr.write(warning+"this text may be too big for the BBC Micro because it exceeds the "+limits_exceeded[0]+after)
def bbc_prepDefaultLex(outFile):
    """Special-case function set as lex_header in bbcmicro format. If SPEECH_DISK and MAKE_SPEECH_ROM is set, then read the ROM code from SPEECH_DISK and write to outFile (meant to go before the lexicon, to make a modified BBC Micro Speech ROM with custom lexicon).

    Does nothing unless the MAKE_SPEECH_ROM environment variable is set to something non-empty. When it is set, SPEECH_DISK is required: a KeyError or an error from open() means it was not set or was set incorrectly, a ValueError from index() means the file is not a Speech disk, and the assertion fails if the ROM code found is not exactly &1683 bytes long."""
    if not os.environ.get("MAKE_SPEECH_ROM",0): return
    sd = open(os.environ['SPEECH_DISK']) # if this fails, SPEECH_DISK was not set or was set incorrectly (it's required for MAKE_SPEECH_ROM)
    try: d = getBuf(sd).read()
    finally: sd.close() # close promptly rather than leaving the disk image open until garbage collection
    i = d.index(as_utf8('LO')+chr(0x80)+as_utf8('LP')+chr(0x80)+chr(0x82)+chr(0x11)) # start of SP8000 file (if this fails, it wasn't a Speech disk)
    j = d.index(as_utf8('>OUS_'),i) # start of lexicon (ditto)
    assert j-i==0x1683, "Is this really an original disk image?"
    getBuf(outFile).write(d[i:j]) # ROM code only: the caller's lexicon follows it
def bbc_appendDefaultLex(outFile):
    """Special-case function set as lex_footer in bbcmicro format. If SPEECH_DISK is set, read Speech's default lexicon from it and append this to outFile. Otherwise just write a terminating >** to outFile. In either case, check for exceeding 16k if we're MAKE_SPEECH_ROM, close the file and call print_bbclex_instructions.

    The default lexicon is copied without its own >** terminator, so the terminator written afterwards is needed in both cases. A ValueError from index() means SPEECH_DISK is not a Speech disk; the assertions fail if the default lexicon is not 2201 bytes or (with MAKE_SPEECH_ROM) the finished file is over 16384 bytes."""
    if os.environ.get("SPEECH_DISK",""):
        sd = open(os.environ['SPEECH_DISK'])
        try: d = getBuf(sd).read()
        finally: sd.close() # close promptly rather than leaving the disk image open until garbage collection
        i = d.index(as_utf8('>OUS_')) # if this fails, it wasn't a Speech disk
        j = d.index(as_utf8(">**"),i) # the default lexicon's terminator (not included in the slice below)
        assert j-i==2201, "Lexicon on SPEECH_DISK is wrong size (%d). Is this really an original disk image?" % (j-i)
        getBuf(outFile).write(d[i:j])
        # TODO: can we compress the BBC lexicon? i.e. detect if a rule will happen anyway due to subsequent wildcard rules, and delete it if so (don't know how many bytes that would save)
    outFile.write(">**")
    fileLen = outFile.tell() # must be read before close()
    assert not os.environ.get("MAKE_SPEECH_ROM",0) or fileLen <= 16384, "Speech ROM file got too big (%d)" % fileLen
    outFile.close()
    print_bbclex_instructions(getSetting("bbcmicro","lex_filename"),fileLen)
3276
def bbcshortest(n):
    """Write integer n in as few BBC Micro keystrokes as possible: as decimal if that is strictly shorter, otherwise as hex with its '&' prefix (so hex wins a tie). The result is passed through as_utf8."""
    decimal = str(n)
    hexadecimal = '&%X' % n
    shortest = hexadecimal
    if len(decimal) < len(hexadecimal): shortest = decimal
    return as_utf8(shortest)
def bbcKeystrokes(data,start):
    """Return BBC BASIC keystrokes to put data into RAM starting at address start, without using the BASIC heap in the process (although we do use the page-4 integer variables A% and B% to save some keystrokes when the data has several CHR$(128)s in it). Assumes the data is mostly ASCII so the $ operator is the least-keystrokes method of getting it in (rather than ? and ! operators, assembler EQUB/EQUW/EQUS, 6502 mnemonics, etc); we don't mind about overwriting the byte after with a CHR$(13). Keystrokes are limited to ASCII for easier copy/paste. See comments for more details.

    The result is a sequence of newline-terminated lines, each of the form $address="text"+CHR$(n)+... and each no more than 238 characters long."""
    # Taken to the extreme, a 'find the least keystrokes' function would be some kind of data compressor; we're not doing that here as we assume this is going to be used to poke in a lexicon, which is basically ASCII with a few CHR$(128)s thrown in; this '$ operator' method is highly likely to yield the least keystrokes for that kind of data, apart from setting and using temporary string variables, but then (1) you're in the realms of data compression and (2) you require heap memory, which might not be a good idea depending on where we're putting our lexicon.
    # I suppose it wouldn't hurt in most cases to have an A$=CHR$(128), but not doing this for now because you might be in a situation where you can't touch the heap at all (I'm not sure where the workspace for assembling strings is though).
    # However, just to be pedantic about saving a few bytes, there is one thing we CAN do: if we have a lexicon with a lot of CHR$(128)s in it, let's set up BASIC's page-4 integer variables such that $A%=CHR$(128), saving 6 keystrokes per entry without needing the heap (an additional 1 keystroke per entry could be saved if we didn't mind putting an A$ on the heap).
    # That hack keeps CHR$(128)+CHR$(13) in the low 2 bytes of B% (&408-&409) with A% (&404-&407) pointing at them, so it must not be used if the data would be written over any of those 6 bytes: starting anywhere below &40A is safe only if the data (and the CHR$(13) that $ writes after it) ends before A%.
    page4_hack_end = 0x40A # first address above the bytes the hack relies on
    use_int_hack = ((start>=page4_hack_end or start+len(data)<=1026) and len(data.split(chr(128))) >= 4) # (4 parts = 3 CHR$(128)s, the fewest for which 6 keystrokes saved each time repays the 16-keystroke setup)
    i=0 ; ret=[]
    if use_int_hack: thisLine = as_utf8("A%=&408:B%=&D80:") # (@% is at &400 and each is 4 byte LSB-MSB; $x reads to next 0D)
    # (If we're guaranteed to NOT be using Bas128 and therefore all memory addresses are effectively masked by &FFFF, we can instead set A%=&D800406 (using A%'s low 2 bytes to point to A%'s high 2 bytes) for a 1-off saving of 5 keystrokes and 1 page-4 variable, but this saving is not really worth the readability compromise and the risk posed by the possibility of Bas128 - I don't know how Bas128 treats addresses above &1FFFF)
    # (An even 'nastier' trick would be to put !13=&D80 and then use $13, as those bytes are used by BASIC's random number generator, which presumably isn't called during the paste and we don't mind disrupting it; again I don't know about Bas128. But you can't do it because BASIC gives a "$ range" error on anything below 256.)
    # (I suppose one thing you _could_ do is LOMEM=&400:A$=CHR$(13) and end with LOMEM=TOP, which would overwrite 3 page-4 variables and let you use just A$ instead of $A%, saving keystrokes over A%=&D800406 after 21 more lexicon words, at the expense of losing track of any variables you had on the heap. But this is getting silly.)
    else: thisLine = as_utf8("")
    bbc_max_line_len = 238
    # State while building a line: inQuote = we're inside an open "..." literal; needPlus = the next non-literal item must be preceded by '+'; needCmd = the line has no $address= yet
    inQuote=needPlus=0 ; needCmd=1
    while i<len(data):
        if needCmd:
            # start a new assignment at the address of the next byte still to be written
            thisLine += (as_utf8('$')+bbcshortest(start)+as_utf8('='))
            inQuote=needPlus=needCmd=0
        ch = data[i:i+1] # (a 1-item slice, which ord() and == are applied to below)
        if ch==as_utf8('"'): c,inQ = as_utf8('""'),1 # inQ MUST be 0 or 1, not False/True, because it's also used as 'len of necessary close quote' below
        elif 32<=ord(ch)<127: c,inQ = ch,1
        elif use_int_hack and ord(ch)==128: c,inQ=as_utf8("$A%"),0
        else: c,inQ=(as_utf8("CHR$("+str(ord(ch))+")")),0
        addToLine = [] ; newNeedPlus = needPlus
        if inQ and not inQuote:
            # opening a new literal
            if needPlus: addToLine.append(as_utf8('+'))
            addToLine.append(as_utf8('"'))
            newNeedPlus=0
        elif inQuote and not inQ:
            # closing the literal before a non-literal item
            addToLine.append(as_utf8('"+'))
            newNeedPlus=1 # after what we'll add
        elif not inQ:
            # one non-literal item after another
            if needPlus: addToLine.append(as_utf8('+'))
            newNeedPlus=1 # after what we'll add
        addToLine.append(c)
        addToLine=as_utf8('').join(addToLine)
        if len(thisLine)+len(addToLine)+inQ > bbc_max_line_len: # oops, we've gone too far, back off and end prev line
            if inQuote: thisLine += as_utf8('"')
            ret.append(thisLine)
            thisLine=as_utf8("") ; needCmd=1 ; continue # (i is not advanced, so this byte is retried on the new line)
        thisLine += addToLine ; inQuote=inQ
        needPlus=newNeedPlus ; i += 1 ; start += 1
    if inQuote: thisLine += as_utf8('"')
    if not needCmd: ret.append(thisLine) # (needCmd still set means nothing was added since the last line was stored)
    return as_utf8('\n').join(ret)+as_utf8('\n')
def print_bbclex_instructions(fname,size):
    """Print suitable instructions for a BBC Micro lexicon of the given filename and size (the exact nature of the instructions depends on the size). If appropriate, create a .key file containing keystrokes for transferring to an emulator.

    fname is the lexicon file that has just been written and size is its length in bytes. If MAKE_SPEECH_ROM is set, only the ROM installation note and the general tips are printed. Otherwise instructions are printed for Speech in main RAM (at its default address, or relocated with RELOCAT if the lexicon is too big for that) and for Sideways RAM, and whenever the lexicon fits in main RAM the file is read back and fname+".key" is written."""
    if os.environ.get("MAKE_SPEECH_ROM",0): print ("%s (%d bytes, hex %X) can now be installed on an emulator (set in Roms.cfg or whatever), or loaded onto a chip. The sound quality of this might be worse than that of the main-RAM version." % (fname,size,size)) # (at least on emulation - see comment on sound quality above)
    else:
        print ("The size of this lexicon is %d bytes (hex %X)" % (size,size)) # (the default lexicon is 2204 bytes)
        bbcStart=None # main-RAM address for the lexicon, if we find one
        noSRAM_lex_offset=0x155F # (on the BBC Micro, SRAM means Sideways RAM, not Static RAM as it does elsewhere; for clarity we'd better say "Sideways RAM" in all output)
        SRAM_lex_offset=0x1683
        SRAM_max=0x4000 # 16k
        noSRAM_default_addr=0x5500
        noSRAM_min_addr=0xE00 # minimum supported by RELOCAT
        page=0x1900 # or 0xE00 for Master (but OK to just leave this at 0x1900 regardless of model; it harmlessly increases the range where special_relocate_instructions 'kick in')
        noSRAM_himem=0x7c00 # unless you're in a shadow mode or something (see comments on himem above), however leaving this at 0x7c00 is usually harmless (just causes the 'need to relocate' to 'kick in' earlier, although if memory is really full it might say 'too big' 1k too early)
        def special_relocate_instructions(reloc_addr):
            # Returns extra text to append to the relocation instructions (possibly empty), or False if RELOCAT can't do this relocation at all
            pagemove_min,pagemove_max = max(0xE00,page-0x1E00), page+0xE00 # if relocating to within this range, must move PAGE before loading RELOCAT. RELOCAT's supported range is 0xE00 to 0x5E00, omitting (PAGE-&1E00) to (PAGE+&E00)
            if reloc_addr < 0x1900: extra=" On a Model B with Acorn DFS you won't be able to use the disk after relocating below &1900, and you can't run star commands from tape so you have to initialise via CALL. (On a Master, DFS is not affected as it doesn't use &E00-&1900.)"
            else: extra = ""
            if not pagemove_min<=reloc_addr<pagemove_max:
                return extra # no other special instructions needed
            newpage = reloc_addr+0x1E00
            page_max = min(0x5E00,noSRAM_default_addr-0xE00)
            if newpage > page_max: return False # "Unfortunately RELOCAT can't put it at &%X even with PAGE changes." % reloc_addr
            return " Please run RELOCAT with PAGE in the range of &%X to &%X for this relocation to work.%s" % (newpage,page_max,extra)
        if noSRAM_default_addr+noSRAM_lex_offset+size > noSRAM_himem:
            # won't fit below HIMEM at the default address: try the highest page-aligned address that does
            reloc_addr = noSRAM_himem-noSRAM_lex_offset-size
            reloc_addr -= (reloc_addr%256)
            if reloc_addr >= noSRAM_min_addr:
                instr = special_relocate_instructions(reloc_addr)
                if instr is False: print ("This lexicon is too big for Speech in main RAM even with relocation, unless RELOCAT is rewritten to work from files.") # (identity test, because the empty string is a valid non-failure result)
                else:
                    bbcStart = reloc_addr+noSRAM_lex_offset
                    reloc_call = reloc_addr + 0xB00
                    print ("This lexicon is too big for Speech at its default address of &%X, but you could use RELOCAT to put a version at &%X and then initialise it with CALL %s (or do the suggested *SAVE, reset, and run *SP). Be sure to set HIMEM=&%X. Then *LOAD %s %X or change the relocated SP file from offset &%X.%s" % (noSRAM_default_addr,reloc_addr,bbcshortest(reloc_call),reloc_addr,fname,bbcStart,noSRAM_lex_offset,instr))
            else: print ("This lexicon is too big for Speech in main RAM even with relocation.")
        else: # fits at default location - no relocation needed
            bbcStart = noSRAM_default_addr+noSRAM_lex_offset
            print ("You can load this lexicon by *LOAD %s %X or change the SPEECH file from offset &%X. Suggest you also set HIMEM=&%X for safety." % (fname,bbcStart,noSRAM_lex_offset,noSRAM_default_addr))
        if bbcStart: # we managed to fit it into main RAM
            f = open(fname)
            try: keys = bbcKeystrokes(getBuf(f).read(),bbcStart)
            finally: f.close()
            f = open(fname+".key","w")
            try: getBuf(f).write(keys)
            finally: f.close() # make sure the keystrokes are flushed to disk before we announce the file
            print ("For ease of transfer to emulators etc, a self-contained keystroke file for putting %s data at &%X has been written to %s.key" % (fname,bbcStart,fname))
            if len(keys) > 32767: print ("(This file looks too big for BeebEm to paste though)") # see comments elsewhere
        # Instructions for replacing lex in SRAM:
        if size > SRAM_max-SRAM_lex_offset: print ("This lexicon is too big for Speech in Sideways RAM.") # unless you can patch Speech to run in SRAM but read its lexicon from main RAM, or run in main RAM but page in multiple banks of SRAM for the lexicon (but even then there'll be a limit)
        else: print ("You can load this lexicon into Sideways RAM by *SRLOAD %s %X 7 (or whichever bank number you're using), or change the SP8000 file from offset &%X." % (fname,SRAM_lex_offset+0x8000,SRAM_lex_offset))
        if not os.environ.get("SPEECH_DISK",""): print ("If you want to append the default lexicon to this one, set SPEECH_DISK to the image of the original Speech disk before running lexconvert, e.g. export SPEECH_DISK=/usr/local/BeebEm3/diskimg/Speech.ssd")
        if size <= SRAM_max-SRAM_lex_offset: print ("You can also set MAKE_SPEECH_ROM=1 (along with SPEECH_DISK) to create a SPEECH.ROM file instead")
    print ("If you get 'Mistake in speech' when testing some words, try starting with '*SAY, ' (this seems to be a Speech bug)") # - can't track down which words it does and doesn't apply to
    print ("It might be better to load your lexicon into eSpeak and use lexconvert's --phones option to drive the BBC with phonemes.")
3377
def mainopt_version(i):
    # Handler for --version: prints the part of the module doc string's first line that comes before " - " (i.e. lexconvert's name and version number) and nothing else. The argument i is not used.
    # (Deliberately no doc string here: main() takes the mainopt_ doc strings as help text.)
    # TODO: doc string for the help? (or would this option clutter it needlessly)
    title_line = __doc__.split("\n",1)[0]
    name_and_version = title_line.split(" - ",1)[0]
    print (name_and_version)
3381
def main():
    """Command-line entry point. Looks through the module's globals for mainopt_ functions: if the option corresponding to one of them is on the command line, that function is called with the option's position in sys.argv, and anything it returns (or raises as a Message) is written to standard error. Otherwise the help is printed, taking each option's description from its function's doc string. Returns the error code to send back to the OS."""
    def funcToOpt(n):
        # e.g. mainopt_foo_bar becomes --foo-bar
        return "--"+n[n.index("_")+1:].replace("_","-")
    for name,func in globals().items():
        if not name.startswith('mainopt_'): continue
        option = funcToOpt(name)
        if option not in sys.argv: continue
        try: msg = func(sys.argv.index(option))
        except Message:
            # Python 2.6+ can have "except Message as e",
            # but Python 2.5 has to have "except Message,e"
            # which is disallowed in Python 3, so
            msg = sys.exc_info()[1].message
        if not msg: return 0
        sys.stdout.flush()
        sys.stderr.write(msg+"\n")
        return 1
    # No option function was selected, so print the help
    html = ('--htmlhelp' in sys.argv) # (undocumented option used for my website, don't rely on it staying)
    if html:
        def htmlify(h): return re.sub('(--[2A-Za-z-]*)',r'<kbd>\1</kbd>',h.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;').replace('\n','<br>'))
        missALine = "<p>"
    else:
        htmlify = lambda x:x
        missALine = ""
    print (htmlify(__doc__))
    print (missALine)
    if '--formats' in sys.argv: # non-HTML mode only (format descriptions are included in HTML anyway, and don't worry about the capability summary)
        print ("Available pronunciation formats (and support levels):")
        for fmt in sorted(lexFormats.keys()):
            types = []
            if fmt != "example": types.append("phones")
            if fmt=="mac-uk": types.append("speaking")
            else:
                if checkSetting(fmt,"lex_read_function"): types.append("lex-read")
                if checkSetting(fmt,"lex_filename") and checkSetting(fmt,"lex_entry_format"):
                    ltype = checkSetting(fmt,"lex_type")
                    if ltype: ltype=" as "+ltype
                    types.append("lex-write"+ltype)
            print ("\n"+fmt+" ("+", ".join(types)+")")
            print (getSetting(fmt,"doc"))
        return 0
    elif html:
        print ("Available pronunciation formats:")
        print ('<table id="formats">')
        for fmt in sorted(lexFormats.keys()):
            print ('<tr><td valign="top"><nobr>'+fmt+'</nobr></td><td valign="top">'+htmlify(getSetting(fmt,"doc"))+"</td></tr>")
        print ("</table><script><!-- try to be more readable on some smartphones\nif(((screen && screen.width<600) || navigator.userAgent.slice(-6)==\"Gecko/\" /* UC Browser? */) && document.getElementById && document.getElementById('formats').outerHTML) document.getElementById('formats').outerHTML = document.getElementById('formats').outerHTML.replace(/<table/g,'<dl').replace(/<.table/g,'<'+'/dl').replace(/<tr><td/g,'<dt').replace(/<.td><td/g,'<'+'/dt><dd').replace(/<.td><.tr/g,'<'+'/dd');\n//--></script>")
    else: print ("Available pronunciation formats: "+", ".join(sorted(lexFormats.keys()))+"\n(Use --formats to see their descriptions)")
    print (missALine)
    print ("Program options:")
    print (missALine)
    if html: print ("<dl>")
    # Sort key: False (sorts first) if the doc string is missing or starts with '*', then the name
    entries = []
    for name,obj in globals().items():
        doc = obj.__doc__
        entries.append((bool(doc) and not doc.startswith('*'),name,doc))
    entries.sort()
    for _,opt,desc in entries:
        if not opt.startswith("mainopt_"): continue
        if not desc: continue # undocumented option
        opt = funcToOpt(opt)
        params,rest = desc.split("\n",1) # 1st line of the doc string is the option's parameters (if any)
        if params.startswith('*'): params=params[1:]
        if params: opt += (' '+params)
        if html: print ("<dt>"+htmlify(opt)+"</dt><dd>"+htmlify(rest)+"</dd>")
        else: print (opt+"\n"+rest+"\n")
    if html: print ("</dl>")
    return 0
3442
catchingSigs = inSigHandler = False # set by catchSignals and by its handler respectively
def catchSignals():
    """We had better try to catch all signals if using MacBritish_System_Lexicon so we can safely clean it up. We raise KeyboardInterrupt instead (need to catch this). Might not work with multithreaded code.

    Only the first call does anything. It installs a handler on every signal number except child-exit, stop/continue and window-size-change, and except those that are currently being ignored; signals that cannot be caught are skipped. The handler passes the signal on to our whole process group once, reports it on standard error and raises KeyboardInterrupt."""
    global catchingSigs
    if catchingSigs: return
    catchingSigs = True
    import signal
    def f(sigNo,*args):
        global inSigHandler
        if inSigHandler: return # (this includes the copy of the signal that the killpg below sends back to us)
        inSigHandler = True
        os.killpg(os.getpgrp(),sigNo)
        sys.stderr.write("\nCaught signal %d\n" % sigNo)
        raise KeyboardInterrupt
    leave_alone = [
        signal.SIGCHLD, # sent on subprocess completion
        signal.SIGTSTP,signal.SIGCONT, # Ctrl-Z / fg
        signal.SIGWINCH, # window-size change
        ]
    for n in range(1,signal.NSIG):
        if n in leave_alone or signal.getsignal(n)==signal.SIG_IGN: continue
        try: signal.signal(n,f)
        except (ValueError,OSError,RuntimeError): pass # this signal can't be caught. (Not a bare except: that would also swallow the KeyboardInterrupt raised by f if a signal we've just hooked arrives at this point.)
3465 class MacBritish_System_Lexicon(object):
3466 """Overwrites some of the pronunciations in the system
3467 lexicon (after backing up the original). Cannot
3468 change the actual words in the system lexicon, so just
3469 alters pronunciations of words you don't intend to use
3470 so you can substitute these into your texts.
3471 Restores the lexicon on close()."""
3472 instances = {}
def __init__(self,text="",voice="Daniel"):
    """text is the text you want to speak (so that any
    words used in it that are not mentioned in your
    lexicon are unchanged in the system lexicon);
    text="" means you just want to speak phonemes.
    Special value of text=False means lexicon read only.
    voice can be Daniel, Emily or Serena.

    Unless text is False, this takes a lock file in /tmp
    for the voice, makes a backup copy of the voice file
    (via sudo) if there isn't one already, opens the
    voice file for update, registers the object in
    MacBritish_System_Lexicon.instances and calls
    catchSignals().  With text=False the voice file (or
    its backup if one exists) is opened read-only.
    The byte positions of the word list and phoneme list
    within the voice file are cached in a ".lexdir" file
    beside it.  Fails with AssertionError if the voice is
    not installed or a step goes wrong, and Exception if
    the voice file doesn't contain the expected entries."""
    self.voice = False
    if not text==False:
        assert not voice in MacBritish_System_Lexicon.instances, "There is already another instance of MacBritish_System_Lexicon for the "+voice+" voice"
        assert not os.system("lockfile -1 -r 10 /tmp/"+voice+".PCMWave.lock") # in case some other process has it (note: if you run with python -O, this check won't happen!)
        self.voice = voice # (don't set this if text==False, since we won't need cleanup on __del__)
    self.filename = "/System/Library/Speech/Voices/"+voice+".SpeechVoice/Contents/Resources/PCMWave"
    assert not (not os.path.exists(self.filename) and os.path.exists("/System/Library/Speech/Voices/"+voice+"Compact.SpeechVoice/Contents/Resources/PCMWave")), "The only installation of "+voice+" found on this system was the Compact one, which lexconvert does not yet support" # TODO: could try self.wordIndexStart = findW("Abiquiu"),self.phIndexStart = findW("'@b.Ik.ju"),self.wordIndexEnd = findW("www.youtube.com",1),self.phIndexEnd = findW("'d^b.l.ju.'d^b.l.ju.'d^b.l.ju.dA+t.'ju.'tjub.dA+t.kA+m",1), but "t" in phones should be ignored, "activesync" and "afterlife" have no phones, "aqua" has TWO sets of phonemes (aquarium ok) and there are other synchronization issues.
    # TODO: some sync issues persist even on the NON-Compact version in newer versions of macOS (e.g. 10.12). This currently leads to exceptions in findW on such systems (which do say it could be due to wrong version of the voice); fixing would need looking at more sync issues as above
    assert os.path.exists(self.filename),"Cannot find an installation of '"+voice+"' on this system"
    if os.path.exists(self.filename+"0"):
        if text==False: self.filename += "0" # (use the backup file for read-only, if we created one before; this means we don't have to worry about locks)
    elif not text==False: # create a backup
        sys.stderr.write("Backing up "+self.filename+" to "+self.filename+"0...\n") # (you'll need a password if you're not running as root)
        err = os.system("sudo mv \""+self.filename+"\" \""+self.filename+"0\"; sudo cp \""+self.filename+"0\" \""+self.filename+"\"; sudo chown "+str(os.getuid())+" \""+self.filename+"\"")
        assert not err, "Error creating backup"
    lexFile = self.filename+".lexdir" # (named after the backup file if that's the one we're reading)
    if not os.path.exists(lexFile) and not text==False:
        sys.stderr.write("Creating lexdir file...\n")
        err = os.system("sudo touch \""+lexFile+"\" ; sudo chown "+str(os.getuid())+" \""+lexFile+"\"")
        assert not err, "Error creating lexdir"
    compat_err = "\nThis probably means your Mac has a new version of the voice that is no longer compatible with this system-lexicon patch."
    import cPickle
    if os.path.exists(lexFile) and os.stat(lexFile).st_size:
        # Positions were cached by an earlier run. NOTE(review): this unpickles whatever is in the lexdir file, so it relies on nobody else being able to write to that file.
        lexdir = open(lexFile)
        try: self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd = cPickle.Unpickler(lexdir).load()
        finally: lexdir.close()
    else:
        f = open(self.filename)
        try: dat = getBuf(f).read()
        finally: f.close() # (only needed for the search; the working handle is opened below)
        def findW(word,rtnPastEnd=0):
            # Byte position in dat of the one and only occurrence of word followed by a zero byte (position just past that zero byte if rtnPastEnd)
            i = re.finditer(re.escape(word+chr(0)),dat)
            try: n = i.next()
            except StopIteration: raise Exception(word+" not found in voice file"+compat_err)
            try:
                n2 = i.next()
                raise Exception("%s does not uniquely identify a byte position (has at least %d and %d)%s" % (word,n.start(),n2.start(),compat_err))
            except StopIteration: pass
            if rtnPastEnd: return n.end()
            else: return n.start()
        # first and last entries of the word list and of the phoneme list
        self.wordIndexStart = findW("808s")
        self.phIndexStart = findW("'e&It.o&U.e&Its")
        self.wordIndexEnd = findW("zombie",1)
        self.phIndexEnd = findW("'zA+m.bI",1)
        if not text==False:
            lexdir = open(lexFile,"w")
            try: cPickle.Pickler(lexdir).dump((self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd))
            finally: lexdir.close() # make sure the cache is written out rather than relying on garbage collection
    if text==False: self.dFile = open(self.filename)
    else: self.dFile = open(self.filename,'r+')
    assert len(self.allWords()) == len(self.allPh()), str(len(self.allWords()))+" words but "+str(len(self.allPh()))+" phonemes"+compat_err
    self.textToAvoid = u""
    if text==False: return
    MacBritish_System_Lexicon.instances[voice] = self
    self.textToAvoid = text.decode('utf-8').replace(unichr(160),' ') ; self.restoreDic = {}
    catchSignals()
def allWords(self):
    "Returns a list of the words defined in the system lexicon (these words won't be changed, but see allPh). They are read from the part of the voice file between wordIndexStart and wordIndexEnd, split at zero bytes, omitting empty items."
    self.dFile.seek(self.wordIndexStart)
    span = self.wordIndexEnd-self.wordIndexStart
    raw = getBuf(self.dFile).read(span)
    words = []
    for item in raw.split(chr(0)):
        if item: words.append(item)
    return words
def allPh(self):
    "Returns a list of (file position, phoneme string) for each of the primary phoneme entries from the system lexicon. These entries can be changed in-place by writing to the said file position, and then spoken by giving the voice the corresponding word from allWords (but see also usable_words). Items with no printable ASCII character in them are left out, as are a few known secondary pronunciations."
    self.dFile.seek(self.phIndexStart)
    items = getBuf(self.dFile).read(self.phIndexEnd-self.phIndexStart).split(chr(0))
    # (the listed pronunciations are secondary ones that for some reason are in the list)
    always_secondary = ["'a&I.'fo&Un","'lI.@n","'so&Un.j$"]
    secondary_if_repeated = ["'tR+e&I.si"]
    entries = [] ; pos = self.phIndexStart ; last = None
    for item in items:
        if re.search(r'[ -~]',item):
            repeated = (item==last and item in secondary_if_repeated)
            if item not in always_secondary and not repeated: entries.append((pos,item))
            last = item
        pos += (len(item)+1) # +1 for the \x00
    assert pos==self.phIndexEnd+1 # +1 because the last \00 will result in a "" item after; the above +1 will be incorrect for that item
    return entries
def usable_words(self,words_ok_to_redefine=[]):
    "Generates (word,phoneme_file_position,original_phonemes) by pairing up allWords with allPh, but leaving out any words that don't seem 'usable': words with anything other than lower-case letters and digits in them (for example words that contain spaces, since these lexicon entries don't seem to be actually used by the voice), and words whose phonemes are empty or don't start with a printable non-space ASCII character. Words that occur in self.textToAvoid are also considered non-usable, unless they also occur in words_ok_to_redefine (user lexicon)."
    for word,entry in zip(self.allWords(),self.allPh()):
        pos,phonemes = entry
        if re.match("^[a-z0-9]*$",word) is None: continue # it seems words not matching this regexp are NOT used by the engine
        if not phonemes: continue
        if not 32<ord(phonemes[:1])<127: continue # better not touch those, just in case
        if word in self.textToAvoid:
            if word not in words_ok_to_redefine: continue
        yield word,pos,phonemes
def check_redef(self,wordsAndPhonemes):
    "Diagnostic function to list on standard error the 'redefinitions' we want to make. wordsAndPhonemes is a list of (original system-lexicon word, proposed new phonemes). Each word is lower-cased, and is listed with its old phonemes (fetched from allPh) and new phonemes if it consists only of letters and digits and is in allWords. A heading is written before the first such word; nothing is written if there aren't any."
    systemWords = self.allWords()
    systemPh = None # fetched only if there's something to report
    for word,newPh in wordsAndPhonemes:
        word = word.lower()
        if re.match("^[a-z0-9]*$",word) and word in systemWords:
            if systemPh is None:
                systemPh = self.allPh()
                sys.stderr.write("Warning: some words were already in system lexicon\nword\told\tnew\n")
            oldPh = systemPh[systemWords.index(word)][1]
            sys.stderr.write(word+"\t"+oldPh+"\t"+newPh+"\n")
def speakPhones(self,phonesList):
    "Speaks every phonetic word in phonesList. Each item is set (via setMultiple) as the pronunciation of a substitute word in the system lexicon, and the substitute words are then sent, separated by spaces, to the Mac's say command for this object's voice. The pipe to that command is closed before returning."
    words = [str(x)+"s" for x in range(len(phonesList))] # placeholder words ("0s", "1s", ...) to ask setMultiple for substitutes for
    d = self.setMultiple(words,phonesList)
    msc = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
    try: getBuf(msc).write(as_utf8(" ").join(d.get(w,as_utf8("")) for w in words))
    finally: msc.close() # close the pipe explicitly rather than leaving it to garbage collection
def readWithLex(self,lex):
    "Reads the text given in the constructor after setting up the lexicon with the given (word,phoneme) list. Only the lexicon words that actually occur in the text are set (via setMultiple); their occurrences are replaced by the substitute words in the text that is sent to the Mac's say command, and if standard output is an xterm-like or screen terminal the original text is printed with those occurrences underlined. The pipe to the say command is closed before returning."
    # self.check_redef(lex) # uncomment if you want to know about these
    textToPrint = u' '+self.textToAvoid+u' '
    tta = ' '+self.textToAvoid.replace(u'\u2019',"'").replace(u'\u2032','').replace(u'\u00b4','').replace(u'\u02b9','').replace(u'\u00b7','').replace(u'\u2014',' ')+' ' # (ignore pronunciation marks 2032 and b7 that might be in the text, but still print them in textToPrint; also normalise apostrophes but not in textToPrint, and be careful with dashes as lex'ing the word after a hyphen or em-dash won't work BUT we still want to support hyphenated words IN the lexicon, so em-dashes are replaced here and hyphens are included in nonWordBefore below)
    words2,phonemes2 = [],[] # keep only the ones actually used in the text (no point setting whole lexicon)
    nonWordBefore=r"(?i)(?<=[^A-Za-z"+chr(0)+"-])" # see below for why chr(0) is included, and see comment above for why hyphen is at the end; (?i) = ignore case
    nonWordAfter=r"(?=([^A-Za-z'"+unichr(0x2019)+"-]|['"+unichr(0x2019)+r"-][^A-Za-z]))" # followed by non-letter non-apostrophe, or followed by apostrophe non-letter (so not if followed by "'s", because the voice won't use our custom lex entry if "'s" is added to the lex'd word, TODO: automatically add "'s" versions to the lexicon via +s or +iz?) (also not if followed by hyphen-letters; hyphen before start is handled above, although TODO preceded by non-letter + hyphen might be OK)
    ttal = tta.lower() # for a quick substring test before the regexp one
    for ww,pp in lex:
        ww = ww.decode('utf-8') # so you can add words with accents etc (in utf-8) to the lexicon
        if ww.lower() in ttal and re.search(nonWordBefore+re.escape(ww)+nonWordAfter,tta):
            words2.append(ww) ; phonemes2.append(pp)
    for k,v in self.setMultiple(words2,phonemes2).iteritems():
        # Each substitute word goes in with a chr(0) in front of it; nonWordBefore doesn't match after chr(0), so words we've already put in don't get replaced again
        tta = re.sub(nonWordBefore+re.escape(k)+nonWordAfter,chr(0)+v,tta)
        # In the printed text the word (which may have pronunciation marks between its characters) is kept, and bracketed with chr(0) and chr(1)
        textToPrint = re.sub(nonWordBefore+'('+u'[\u2032\u00b4\u02b9\u00b7]*'.join(re.escape(c) for c in k)+')'+nonWordAfter,chr(0)+r'\1'+chr(1),textToPrint)
    tta = tta.replace(chr(0),'')
    term = os.environ.get("TERM","")
    if ("xterm" in term or term=="screen") and sys.stdout.isatty(): # we can probably underline words (inverse is more widely supported than underline, e.g. should work even on an old Linux console in case someone's using that to control an OS X server, but there might be a *lot* of words, which wouldn't be very good in inverse if user needs dark background and inverse is bright. Unlike Annogen, we're dealing primarily with Latin letters.)
        import textwrap
        textwrap.len = lambda x: len(x.replace(chr(0),"").replace(chr(1),"")) # a 'hack' to make (at least the 2.x implementations of) textwrap ignore our chr(0) and chr(1) markers in their calculations. Relies on textwrap calling len().
        print (textwrap.fill(textToPrint,stdout_width_unix(),break_on_hyphens=False).encode('utf-8').replace(chr(0),"\x1b[4m").replace(chr(1),"\x1b[0m").strip()) # break_on_hyphens=False because we don't really want hyphenated NAMES to be split across lines, and anyway textwrap in (at least) Python 2.7 has a bug that sometimes causes a line breaks to be inserted before a syllable marker symbol like 'prime'
    # else don't print anything (saves confusion)
    msc = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
    try: getBuf(msc).write(tta.encode('utf-8'))
    finally: msc.close() # close the pipe explicitly rather than leaving it to garbage collection
def setMultiple(self,words,phonemes):
    """Sets phonemes for words, returning a dict that maps
    each word to the substitute word whose lexicon slot
    now holds its phonemes.  Flushes file buffer before
    return.

    words and phonemes are parallel sequences.  Each word
    is given a slot (from self.usable_words) whose existing
    phonemes are at least as long as the new ones; slots
    whose word starts with a letter are preferred.  If the
    slots run out, a message is written to stderr and the
    returned dict covers only the words set so far."""
    # avail: (length, substitute word, position, old phonemes)
    # needed: (length, word, new phonemes); both shortest phon first
    avail = sorted((len(phon),word,pos,phon) for word,pos,phon in self.usable_words(words))
    needed = sorted((len(phon),word,phon) for word,phon in zip(words,phonemes))
    startsAlpha = re.compile(as_utf8("[A-Za-z]")).match
    i = 0 ; wDic = {} ; iDone=set() ; mustBeAlpha=True
    # mustBeAlpha: prefer alphabetical words, since
    # these can be capitalised at start of sentence
    # (the prosody doesn't always work if it isn't)
    for l,word,phon in needed:
        while True:
            # Check the bound BEFORE indexing avail: the cursor may
            # already be at the end (no slots at all, or the previous
            # word took the last one)
            if i >= len(avail):
                if mustBeAlpha: # desperate situation: we HAVE to use the non-alphabetical slots now (ideally we should pick words that never occur at start of sentence for them, but this branch is hopefully a rare situation in practice)
                    mustBeAlpha=False ; i=0 ; continue
                sys.stderr.write("Could not find enough lexicon slots!\n") # TODO: we passed 'words' to usable_words's words_ok_to_redefine - this might not be the case if we didn't find enough slots
                self.dFile.flush() ; return wDic
            if avail[i][0] >= l and i not in iDone and (not mustBeAlpha or startsAlpha(avail[i][1])):
                break # slot i is long enough, unused and acceptable
            i += 1
        iDone.add(i)
        _,wSubst,pos,oldPhon = avail[i] ; i += 1
        # Test the slot we just took (pos), not avail[i], which after the increment is the NEXT slot and may not exist
        if pos in self.restoreDic: oldPhon=None # shouldn't happen if setMultiple is called only once, but might be useful for small experiments in the Python interpreter etc
        self.set(pos,phon,oldPhon)
        wDic[word] = wSubst[:1].upper()+wSubst[1:] # always capitalise it so it can be used at start of sentence too (TODO: copy original capitalisation of each instance instead, in case it happens to come directly after a dotted abbreviation? although if it's something that's always capitalised anyway, e.g. most names, then this won't make any difference)
    self.dFile.flush() ; return wDic
def set(self,phPos,val,old=None):
    """Sets phonemes at position phPos to new value.
    Caller should flush the file buffer when done.

    The first time a position is set, old must be the
    phonemes currently stored there: it is saved in
    self.restoreDic, and val must not be longer than it.
    Later calls for the same position must omit old.

    Raises AssertionError if these rules are broken.  The
    checks are explicit raises rather than assert
    statements so they stay active under python -O."""
    # print "Debugger: setting %x to %s" % (phPos,val)
    if old:
        if phPos in self.restoreDic: raise AssertionError("Cannot call set() twice on same phoneme while re-specifying 'old'")
        if len(val) > len(old): raise AssertionError("New phoneme is too long!")
        self.restoreDic[phPos] = old
    elif phPos not in self.restoreDic: raise AssertionError("Must specify old values (for restore) when setting for first time")
    self.dFile.seek(phPos)
    getBuf(self.dFile).write(val+as_utf8(chr(0))) # value is written followed by a chr(0) terminator
def __del__(self):
    "Last-resort cleanup: calls close() if self.voice is still set.  WARNING - this might not be called before exit - best to call close() manually"
    if self.voice: self.close()
def close(self):
    """Write back every value saved in self.restoreDic,
    close the lexicon file, remove this voice's entry from
    MacBritish_System_Lexicon.instances, delete the
    /tmp/<voice>.PCMWave.lock file, and clear self.voice."""
    for phPos,val in self.restoreDic.items():
        self.set(phPos,val) # no 'old' argument: the position is already in restoreDic
    self.dFile.close()
    del MacBritish_System_Lexicon.instances[self.voice]
    # Run the command OUTSIDE the assert: an assert statement (and any
    # call inside it) is skipped entirely under python -O, which would
    # leave the lock file behind
    status = os.system("rm -f /tmp/"+self.voice+".PCMWave.lock")
    assert not status
    self.voice=None
def stdout_width_unix(): # assumes isatty
    """Ask the terminal on file descriptor 1 for its window
    size (TIOCGWINSZ ioctl) and return the second of the
    two shorts it reports."""
    import fcntl,struct,termios
    winsize = fcntl.ioctl(1,termios.TIOCGWINSZ,'1234') # '1234' is just a 4-byte buffer for the result
    first,second = struct.unpack('hh',winsize)
    return second
3645
# Created at the end of the file, in case it refers to
# anything that was defined later
lexFormats = LexFormats()

if __name__ == "__main__":
    sys.exit(main())