unitproc/bkipas.d/lexconvert.py

   1 #!/usr/bin/env python
   2 # May be run with either Python 2 or Python 3
   3
   4 """lexconvert v0.32 - convert phonemes between different speech synthesizers etc
   5 (c) 2007-20 Silas S. Brown.  License: GPL"""
   6
   7 # Run without arguments for usage information
   8
   9 #    This program is free software; you can redistribute it and/or modify
  10 #    it under the terms of the GNU General Public License as published by
  11 #    the Free Software Foundation; either version 3 of the License, or
  12 #    (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU General Public License for more details.
  18
  19 # Old versions of this code are being kept in the E-GuideDog SVN repository at
  20 # http://svn.code.sf.net/p/e-guidedog/code/ssb22/lexconvert
  21 # and on GitHub at https://github.com/ssb22/lexconvert
  22 # and on GitLab at https://gitlab.com/ssb22/lexconvert
  23 # and on Bitbucket at https://bitbucket.org/ssb22/lexconvert
  24 # and at https://gitlab.developers.cam.ac.uk/ssb22/lexconvert
  25 # although some early ones are missing.
  26
  27 def Phonemes():
  28    """Create phonemes by calling vowel(), consonant(),
  29      variant() and other().
  30
  31      For the variants, if a particular variant does not
  32      exist in the destination format then we will treat it
  33      as equivalent to the last non-variant we created.
  34
  35      For anything else that does not exist in the
  36      destination format, we will first try to break the
  37      source's phoneme into parts (e.g. see the treatment
  38      of opt_ol_as_in_gold by eSpeak and bbcmicro), and if
  39      that still doesn't work then we drop a character
  40      (warning depending on the source format's setting of
  41      safe_to_drop_characters).  makeDic does however warn
  42      about any non-variant consonants, or non-variant
  43      vowels that weren't marked optional, missing from a
  44      format. """
  45    a_as_in_ah = vowel() # statement ORDER matters throughout this function: per the docstring, each variant() below belongs to the last non-variant created before it, so do not reorder or regroup these lines
  46    _, var1_a_as_in_ah = variant() # variant() is unpacked as a pair: the first item is thrown away into _ and only the second is kept under the variant's name
  47    _, var3_a_as_in_ah = variant() # NOTE(review): the names jump from var1 to var3 (no var2_a_as_in_ah is defined here) - confirm this gap is intentional
  48    _, var4_a_as_in_ah = variant()
  49    _, var5_a_as_in_ah = variant()
  50    a_as_in_apple = vowel()
  51    u_as_in_but = vowel() # or the first part of un as in hunt
  52    _, var1_u_as_in_but = variant()
  53    o_as_in_orange = vowel()
  54    _, var1_o_as_in_orange = variant()
  55    _, var2_o_as_in_orange = variant()
  56    o_as_in_now = vowel()
  57    _, var1_o_as_in_now = variant()
  58    a_as_in_ago = vowel()
  59    _, var1_a_as_in_ago = variant()
  60    e_as_in_herd = vowel()
  61    _, ar_as_in_year = variant() # not named varN, but created as a variant directly after e_as_in_herd, so by the docstring's rule it falls back to e_as_in_herd
  62    eye = vowel()
  63    _, var1_eye = variant()
  64    b = consonant()
  65    ch = consonant()
  66    d = consonant()
  67    th_as_in_them = consonant()
  68    e_as_in_them = vowel()
  69    _, var1_e_as_in_them = variant()
  70    a_as_in_air = vowel()
  71    _, var1_a_as_in_air = variant()
  72    _, var2_a_as_in_air = variant()
  73    _, var3_a_as_in_air = variant()
  74    _, var4_a_as_in_air = variant()
  75    a_as_in_ate = vowel()
  76    _, var1_a_as_in_ate = variant()
  77    f = consonant()
  78    g = consonant()
  79    h = consonant()
  80    i_as_in_it = vowel()
  81    _, var1_i_as_in_it = variant()
  82    _, var2_i_as_in_it = variant()
  83    ear = vowel()
  84    _, var1_ear = variant()
  85    _, var2_ear = variant()
  86    e_as_in_eat = vowel()
  87    _, var1_e_as_in_eat = variant()
  88    j_as_in_jump = consonant()
  89    k = consonant()
  90    _, opt_scottish_loch = variant() # created as a variant directly after k, so by the docstring's rule it falls back to k
  91    l = consonant()
  92    _, var1_l = variant()
  93    m = consonant()
  94    n = consonant()
  95    ng = consonant()
  96    o_as_in_go = vowel()
  97    _, var1_o_as_in_go = variant()
  98    _, var2_o_as_in_go = variant()
  99    opt_ol_as_in_gold = opt_vowel() # see eSpeak / bbcmicro
 100    oy_as_in_toy = vowel()
 101    _, var1_oy_as_in_toy = variant()
 102    p = consonant()
 103    r = consonant()
 104    _, var1_r = variant()
 105    s = consonant()
 106    sh = consonant()
 107    t = consonant()
 108    _, var1_t = variant()
 109    th_as_in_think = consonant()
 110    oor_as_in_poor = vowel()
 111    _, var1_oor_as_in_poor = variant()
 112    _, opt_u_as_in_pull = variant() # created as a variant while oor_as_in_poor is the last non-variant, so by the docstring's rule it falls back to oor_as_in_poor
 113    opt_ul_as_in_pull = opt_vowel() # see eSpeak / bbcmicro
 114    oo_as_in_food = vowel()
 115    _, var1_oo_as_in_food = variant()
 116    _, var2_oo_as_in_food = variant()
 117    close_to_or = vowel()
 118    _, var1_close_to_or = variant()
 119    _, var2_close_to_or = variant()
 120    _, var3_close_to_or = variant()
 121    v = consonant()
 122    w = consonant()
 123    _, var1_w = variant()
 124    y = consonant()
 125    z = consonant()
 126    ge_of_blige_etc = consonant()
 127    glottal_stop = other()
 128    syllable_separator = other()
 129    _, primary_stress = variant() # the two stress marks are created as variants directly after syllable_separator (presumably so a format lacking them falls back to it - confirm against makeDic)
 130    _, secondary_stress = variant()
 131    text_sharp = other()
 132    text_underline = other()
 133    text_question = other()
 134    text_exclamation = other()
 135    text_comma = other()
 136    ipa_colon = other() # for catching missed cases
 137    del _ ; return locals() # remove the throwaway _ so it is not exported, then return every remaining local as a name->value dict; the local variable names above are therefore this function's public interface (LexFormats copies them into globals())
 138
 139 def LexFormats():
 140   """Makes the phoneme conversion tables of each format.
 141      Each table has string to phoneme entries and phoneme
 142      to string entries.  The string to phoneme entries are
 143      used when converting OUT of that format, and the
 144      phoneme to string entries are used when converting IN
 145      (so you can recognise phonemes you don't support and
 146      convert them to something else).  By default, a tuple
 147      of the form (string,phoneme) will create entries in
 148      BOTH directions; one-directional entries are created
 149      via (string,phoneme,False) or (phoneme,string,False).
 150      The makeDic function checks the keys are unique.
 151
 152      First parameter is always a description of the
 153      format, then come the phoneme entries as described
 154      above, then any additional settings:
 155
 156        stress_comes_before_vowel (default False means any
 157        stress mark goes AFTER the affected vowel; set to
 158        True if the format requires stress placed before)
 159
 160        word_separator (default same as phoneme_separator)
 161        phoneme_separator (default " ")
 162        clause_separator (default newline)
 163
 164        (For a special case, clause_separator can also be
 165         set to a function.  If that happens, the function
 166         will be called whenever lexconvert needs to output
 167         a list of (lists of words) in this format.  See
 168         bbcmicro for an example function clause_separator)
 169
 170        safe_to_drop_characters (default False, can be a
 171        string of safe characters or True = all; controls
 172        warnings when unrecognised characters are found)
 173
 174        approximate_missing (default False) - if True,
 175        makeDic will attempt to compensate for missing
 176        phonemes by approximating them to others, instead of
 177        warning about them.  This is useful for American codes
 178        that can't cope with all the British English phonemes.
 179        (Approximation is done automatically anyway in the
 180        case of variant phonemes; approximate_missing adds in
 181        some additional approximations - see comments in code)
 182
 183        cleanup_regexps (default none) - optional list of
 184        (search,replace) regular expressions to "clean up"
 185        after converting each word INTO this format
 186        cleanup_func (default none) - optional special-case
 187        function to pass result through after cleanup_regexps
 188
 189        cvtOut_regexps (default none) - optional list of
 190        (search,replace) regular expressions to "clean up"
 191        before starting to convert OUT of this format
 192        cvtOut_func (default none) - optional special-case
 193        function to pass through before any cvtOut_regexps
 194
 195        inline_format (default "%s") the format string for
 196        printing a word with --phones or --phones2phones
 197        (can be used to put markup around each word)
 198        (can also be a function taking the phonetic word
 199         and returning the resulting string, e.g. bbcmicro)
 200
 201        output_is_binary (default False) - True if the output
 202        is almost certainly unsuitable for a terminal; will
 203        cause lexconvert to refuse to print phonemes unless
 204        its standard output is redirected to a file or pipe
 205        (affects the --phones and --phones2phones options)
 206
 207        inline_header (default none) text to print first
 208          when outputting from --phones or --phones2phones
 209        inline_footer (default none) text to print last
 210        inline_oneoff_header (default none) text to print
 211          before inline_header on the first time only
 212
 213        lex_filename - filename of a lexicon file.  If this
 214        is not specified, there is no support for writing a
 215        lexicon in this format: there can still be READ
 216        support if you define lex_read_function to open the
 217        lexicon by itself, but otherwise the format can be
 218        used only with --phones and --phones2phones.
 219
 220        lex_entry_format - format string for writing each
 221        (word, pronunciation) entry to the lexicon file.
 222        This is also needed for lexicon-write support.
 223
 224        lex_header, lex_footer - optional strings to write
 225        at the beginning and at the end of the lexicon file
 226        (can also be functions that take the open file as a
 227         parameter, e.g. for bbcmicro; lex_footer is
 228         allowed to close the file if it needs to do
 229         something with it afterwards)
 230
 231        lex_word_case - optional "upper" or "lower" to
 232        force a particular case for lexicon words (not
 233        pronunciations - they're determined by the table).
 234        The default is to allow words to be in either case.
 235
 236        lex_type (default "") - used by the --formats
 237        option when summarising the support for each format
 238
 239        lex_read_function - Python function to READ the
 240        lexicon file and return a (word,phonemes) list.
 241        If this is not specified, there's no read support
 242        for lexicons in this format (but there can still be
 243        write support - see above - and you can still use
 244        --phones and --phones2phones).  If lex_filename is
 245        specified then this function will be given the open
 246        file as a parameter. """
 247
 248   phonemes = Phonemes() ; globals().update(phonemes)
 249   return { "festival" : makeDic(
 250     "Festival's British voice",
 251     ('0',syllable_separator),
 252     ('1',primary_stress),
 253     ('2',secondary_stress),
 254     ('aa',a_as_in_ah),
 255     ('a',a_as_in_apple),
 256     ('uh',u_as_in_but),
 257     ('o',o_as_in_orange),
 258     ('au',o_as_in_now),
 259     ('@',a_as_in_ago),
 260     ('@@',e_as_in_herd),
 261     ('ai',eye),
 262     ('b',b),
 263     ('ch',ch),
 264     ('d',d),
 265     ('dh',th_as_in_them),
 266     ('e',e_as_in_them),
 267     (ar_as_in_year,'@@',False),
 268     ('e@',a_as_in_air),
 269     ('ei',a_as_in_ate),
 270     ('f',f),
 271     ('g',g),
 272     ('h',h),
 273     ('i',i_as_in_it),
 274     ('i@',ear),
 275     ('ii',e_as_in_eat),
 276     ('jh',j_as_in_jump),
 277     ('k',k),
 278     ('l',l),
 279     ('m',m),
 280     ('n',n),
 281     ('ng',ng),
 282     ('ou',o_as_in_go),
 283     ('oi',oy_as_in_toy),
 284     ('p',p),
 285     ('r',r),
 286     ('s',s),
 287     ('sh',sh),
 288     ('t',t),
 289     ('th',th_as_in_think),
 290     ('u@',oor_as_in_poor),
 291     ('u',opt_u_as_in_pull),
 292     ('uu',oo_as_in_food),
 293     ('oo',close_to_or),
 294     ('v',v),
 295     ('w',w),
 296     ('y',y),
 297     ('z',z),
 298     ('zh',ge_of_blige_etc),
 299     lex_filename=ifset("HOME",os.environ.get("HOME","")+os.sep)+".festivalrc",
 300     lex_entry_format="(lex.add.entry '( \"%s\" n %s))\n",
 301     lex_header=";; -*- mode: lisp -*-\n(eval (list voice_default))\n",
 302     lex_read_function = lambda *args:eval('['+getoutput("grep -vi parameter.set < ~/.festivalrc | grep -v '(eval' | sed -e 's/;.*//' -e 's/.lex.add.entry//' -e s/\"'\"'[(] *\"/[\"/' -e 's/\" [^ ]* /\",(\"/' -e 's/\".*$/&\"],/' -e 's/[()]/ /g' -e 's/  */ /g'")+']'),
 303     safe_to_drop_characters=True, # TODO: really? (could instead give a string of known-safe characters)
 304     cleanup_func = festival_group_stress,
 305   ),
 306
 307   "example" : makeVariantDic(
 308     "A small built-in example lexicon for testing when you don't have your full custom lexicon to hand.  Use --convert to write it in one of the other formats and see if a synth can import it.",
 309     lex_read_function = lambda *args: [
 310        ("Shadrach","shei1drak"),
 311        ("Meshach","mii1shak"),
 312        ("Abednego","@be1dniigou"),
 313     ], cleanup_func = None,
 314     lex_filename=None, lex_entry_format=None, noInherit=True),
 315
 316   "festival-cmu" : makeVariantDic(
 317     "American CMU version of Festival",
 318     ('ae',a_as_in_apple),
 319     ('ah',u_as_in_but),
 320     ('ax',a_as_in_ago),
 321     (o_as_in_orange,'aa',False),
 322     ('aw',o_as_in_now),
 323     ('er',e_as_in_herd), # TODO: check this one
 324     ('ay',eye),
 325     ('eh',e_as_in_them),
 326     (ar_as_in_year,'er',False),
 327     (a_as_in_air,'er',False),
 328     ('ey',a_as_in_ate),
 329     ('hh',h),
 330     ('ih',i_as_in_it),
 331     ('ey ah',ear),
 332     ('iy',e_as_in_eat),
 333     ('ow',o_as_in_go),
 334     ('oy',oy_as_in_toy),
 335     ('uh',oor_as_in_poor),
 336     ('uw',oo_as_in_food),
 337     ('ao',close_to_or),
 338   ),
 339
 340   "espeak" : makeDic(
 341     "eSpeak's default British voice", # but eSpeak's phoneme representation isn't always that simple, hence the regexps at the end
 342     ('%',syllable_separator),
 343     ("'",primary_stress),
 344     (',',secondary_stress),
 345     # TODO: glottal_stop? (in regional pronunciations etc)
 346     ('A:',a_as_in_ah),
 347     ('A@',a_as_in_ah,False),
 348     ('A',var1_a_as_in_ah),
 349     ('a',a_as_in_apple),
 350     ('aa',a_as_in_apple,False),
 351     ('a2',a_as_in_apple,False), # TODO: this is actually an a_as_in_apple variant in espeak; festival @1 is not in mrpa PhoneSet
 352     ('&',a_as_in_apple,False),
 353     ('V',u_as_in_but),
 354     ('0',o_as_in_orange),
 355     ('aU',o_as_in_now),
 356     ('@',a_as_in_ago),
 357     ('a#',a_as_in_ago,False), # (TODO: eSpeak sometimes uses a# in 'had' when in a sentence, and this doesn't always sound good on other synths; might sometimes want to convert it to a_as_in_apple; not sure what contexts would call for this though)
 358     ('3:',e_as_in_herd),
 359     ('3',var1_a_as_in_ago),
 360     ('@2',a_as_in_ago,False),
 361     ('@-',a_as_in_ago,False), # (eSpeak @- sounds to me like a shorter version of @, TODO: double-check the relationship between @ and @2 in Festival)
 362     ('aI',eye),
 363     ('aI2',eye,False),
 364     ('aI;',eye,False),
 365     ('aI2;',eye,False),
 366     ('b',b),
 367     ('tS',ch),
 368     ('d',d),
 369     ('D',th_as_in_them),
 370     ('E',e_as_in_them),
 371     (ar_as_in_year,'3:',False),
 372     ('e@',a_as_in_air),
 373     ('eI',a_as_in_ate),
 374     ('f',f),
 375     ('g',g),
 376     ('h',h),
 377     ('I',i_as_in_it),
 378     ('I;',i_as_in_it,False),
 379     ('i',i_as_in_it,False),
 380     ('I2',var2_i_as_in_it,False),
 381     ('I2;',var2_i_as_in_it,False),
 382     ('i@',ear),
 383     ('i@3',var2_ear),
 384     ('i:',e_as_in_eat),
 385     ('i:;',e_as_in_eat,False),
 386     ('dZ',j_as_in_jump),
 387     ('k',k),
 388     ('x',opt_scottish_loch),
 389     ('l',l),
 390     ('L',l,False),
 391     ('m',m),
 392     ('n',n),
 393     ('N',ng),
 394     ('oU',o_as_in_go),
 395     ('oUl',opt_ol_as_in_gold), # (espeak says "gold" in a slightly 'posh' way though) (if dest format doesn't have opt_ol_as_in_gold, it'll get o_as_in_go + the l)
 396     ('OI',oy_as_in_toy),
 397     ('p',p),
 398     ('r',r),
 399     ('r-',r,False),
 400     ('s',s),
 401     ('S',sh),
 402     ('t',t),
 403     ('T',th_as_in_think),
 404     ('U@',oor_as_in_poor),
 405     ('U',opt_u_as_in_pull),
 406     ('@5',opt_u_as_in_pull,False),
 407     ('Ul',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the U, then the l
 408     ('u:',oo_as_in_food),
 409     ('O:',close_to_or),
 410     ('O@',var3_close_to_or),
 411     ('o@',var3_close_to_or,False),
 412     ('O',var3_close_to_or,False),
 413     ('v',v),
 414     ('w',w),
 415     ('j',y),
 416     ('z',z),
 417     ('Z',ge_of_blige_etc),
 418     lex_filename = "en_extra",
 419     lex_entry_format = "%s %s\n",
 420     lex_read_function = lambda lexfile: [x for x in [l.split()[:2] for l in lexfile.readlines()] if len(x)==2 and not '//' in x[0]],
 421     lex_footer=lambda f:(f.close(),os.system("espeak --compile=en")), # see also a bit of special-case code in mainopt_convert
 422     inline_format = "[[%s]]",
 423     word_separator=" ",phoneme_separator="",
 424     stress_comes_before_vowel=True,
 425     safe_to_drop_characters="_: !",
 426     cleanup_regexps=[
 427       ("k'a2n","k'@n"),
 428       ("ka2n","k@n"),
 429       ("gg","g"),
 430       ("@U","oU"), # (eSpeak uses oU to represent @U; difference is given by its accent parameters)
 431       ("([iU]|([AO]:))@r$","\1@"),
 432       ("([^e])@r",r"\1_remove_3"),("_remove_",""),
 433       # (r"([^iU]@)l",r"\1L") # only in older versions of espeak (not valid in more recent versions)
 434       ("rr$","r"),
 435       ("3:r$","3:"),
 436       ("%%+","%"),("^%",""),("%$",""),
 437       # TODO: 'declared' & 'declare' the 'r' after the 'E' sounds a bit 'regional' (but pretty).  but sounds incomplete w/out 'r', and there doesn't seem to be an E2 or E@
 438       # TODO: consider adding 'g' to words ending in 'N' (if want the 'g' pronounced in '-ng' words) (however, careful of words like 'yankee' where the 'g' would be followed by a 'k'; this may also be a problem going into the next word)
 439     ],
 440      cvtOut_regexps = [
 441        ("e@r$","e@"), ("e@r([bdDfghklmnNprsStTvwjzZ])",r"e@\1"), # because the 'r' is implicit in other synths (but DO have it if there's another vowel to follow)
 442      ],
 443   ),
 444
 445   "sapi" : makeDic(
 446     "Microsoft Speech API (American English)",
 447     ('-',syllable_separator),
 448     ('1',primary_stress),
 449     ('2',secondary_stress),
 450     ('aa',a_as_in_ah),
 451     ('ae',a_as_in_apple),
 452     ('ah',u_as_in_but),
 453     ('ao',o_as_in_orange),
 454     ('aw',o_as_in_now),
 455     ('ax',a_as_in_ago),
 456     ('er',e_as_in_herd),
 457     ('ay',eye),
 458     ('b',b),
 459     ('ch',ch),
 460     ('d',d),
 461     ('dh',th_as_in_them),
 462     ('eh',e_as_in_them),
 463     ('ey',var1_e_as_in_them),
 464     (a_as_in_ate,'ey',False),
 465     ('f',f),
 466     ('g',g),
 467     ('h',h), # Jan suggested 'hh', but I can't get this to work on Windows XP (TODO: try newer versions of Windows)
 468     ('ih',i_as_in_it),
 469     ('iy',e_as_in_eat),
 470     ('jh',j_as_in_jump),
 471     ('k',k),
 472     ('l',l),
 473     ('m',m),
 474     ('n',n),
 475     ('ng',ng),
 476     ('ow',o_as_in_go),
 477     ('oy',oy_as_in_toy),
 478     ('p',p),
 479     ('r',r),
 480     ('s',s),
 481     ('sh',sh),
 482     ('t',t),
 483     ('th',th_as_in_think),
 484     ('uh',oor_as_in_poor),
 485     ('uw',oo_as_in_food),
 486     ('AO',close_to_or),
 487     ('v',v),
 488     ('w',w),
 489     # ('x',var1_w), # suggested by Jan, but I can't get this to work on Windows XP (TODO: try newer versions of Windows)
 490     ('y',y),
 491     ('z',z),
 492     ('zh',ge_of_blige_etc),
 493     approximate_missing=True,
 494     lex_filename="run-ptts.bat", # write-only for now
 495     lex_header = "rem  You have to run this file\nrem  with ptts.exe in the same directory\nrem  to add these words to the SAPI lexicon\n\n",
 496     lex_entry_format='ptts -la %s "%s"\n',
 497     inline_format = '<pron sym="%s"/>',
 498     safe_to_drop_characters=True, # TODO: really?
 499   ),
 500
 501   "cepstral" : makeDic(
 502     "Cepstral's British English SSML phoneset",
 503     ('0',syllable_separator),
 504     ('1',primary_stress),
 505     ('a',a_as_in_ah),
 506     ('ae',a_as_in_apple),
 507     ('ah',u_as_in_but),
 508     ('oa',o_as_in_orange),
 509     ('aw',o_as_in_now),
 510     ('er',e_as_in_herd),
 511     ('ay',eye),
 512     ('b',b),
 513     ('ch',ch),
 514     ('d',d),
 515     ('dh',th_as_in_them),
 516     ('eh',e_as_in_them),
 517     ('e@',a_as_in_air),
 518     ('ey',a_as_in_ate),
 519     ('f',f),
 520     ('g',g),
 521     ('h',h),
 522     ('ih',i_as_in_it),
 523     ('i',e_as_in_eat),
 524     ('jh',j_as_in_jump),
 525     ('k',k),
 526     ('l',l),
 527     ('m',m),
 528     ('n',n),
 529     ('ng',ng),
 530     ('ow',o_as_in_go),
 531     ('oy',oy_as_in_toy),
 532     ('p',p),
 533     ('r',r),
 534     ('s',s),
 535     ('sh',sh),
 536     ('t',t),
 537     ('th',th_as_in_think),
 538     ('uh',oor_as_in_poor),
 539     ('uw',oo_as_in_food),
 540     ('ao',close_to_or),
 541     ('v',v),
 542     ('w',w),
 543     ('j',y),
 544     ('z',z),
 545     ('zh',ge_of_blige_etc),
 546     approximate_missing=True,
 547     lex_filename="lexicon.txt",
 548     lex_entry_format = "%s 0 %s\n",
 549     lex_read_function = lambda lexfile: [(word,pronunc) for word, ignore, pronunc in [l.split(None,2) for l in lexfile.readlines()]],
 550     lex_word_case = "lower",
 551     inline_format = "<phoneme ph='%s'>p</phoneme>",
 552     safe_to_drop_characters=True, # TODO: really?
 553     cleanup_regexps=[(" 1","1"),(" 0","0")],
 554   ),
 555
 556   "mac" : makeDic(
 557     "approximation in American English using the [[inpt PHON]] notation of Apple's US voices",
 558     ('=',syllable_separator),
 559     ('1',primary_stress),
 560     ('2',secondary_stress),
 561     ('AA',a_as_in_ah),
 562     ('aa',var5_a_as_in_ah),
 563     ('AE',a_as_in_apple),
 564     ('UX',u_as_in_but),
 565     (o_as_in_orange,'AA',False),
 566     ('AW',o_as_in_now),
 567     ('AX',a_as_in_ago),
 568     (e_as_in_herd,'AX',False), # TODO: is this really the best approximation?
 569     ('AY',eye),
 570     ('b',b),
 571     ('C',ch),
 572     ('d',d),
 573     ('D',th_as_in_them),
 574     ('EH',e_as_in_them),
 575     ('EY',a_as_in_ate),
 576     ('f',f),
 577     ('g',g),
 578     ('h',h),
 579     ('IH',i_as_in_it),
 580     ('IX',var2_i_as_in_it),
 581     ('IY',e_as_in_eat),
 582     ('J',j_as_in_jump),
 583     ('k',k),
 584     ('l',l),
 585     ('m',m),
 586     ('n',n),
 587     ('N',ng),
 588     ('OW',o_as_in_go),
 589     ('OY',oy_as_in_toy),
 590     ('p',p),
 591     ('r',r),
 592     ('s',s),
 593     ('S',sh),
 594     ('t',t),
 595     ('T',th_as_in_think),
 596     ('UH',oor_as_in_poor),
 597     ('UW',oo_as_in_food),
 598     ('AO',close_to_or),
 599     ('v',v),
 600     ('w',w),
 601     ('y',y),
 602     ('z',z),
 603     ('Z',ge_of_blige_etc),
 604     approximate_missing=True,
 605     lex_filename="substitute.sh", # write-only for now
 606     lex_type = "substitution script",
 607     lex_header = "#!/bin/bash\n\n# I don't yet know how to add to the Apple US lexicon,\n# so here is a 'sed' command you can run on your text\n# to put the pronunciation inline:\n\nsed -E -e :S \\\n",
 608     lex_entry_format=r" -e 's/(^|[^A-Za-z])%s($|[^A-Za-z[12=])/\1[[inpt PHON]]%s[[inpt TEXT]]\2/g'"+" \\\n",
 609     # but /g is non-overlapping matches and won't catch 2 words in the lex right next to each other with only one non-alpha in between, so we put :S at start and tS at end to make the whole operation repeat until it hasn't done any more substitutions (hence also the exclusion of [, 1, 2 or = following a word so it doesn't try to substitute stuff inside the phonemes; TODO: assert the lexicon does not contain "inpt", "PHON" or "TEXT")
 610     lex_footer = lambda f:(f.write(" -e tS\n"),f.close(),os.chmod("substitute.sh",493)), # 493 = 0755, but no way to specify octal that works on both Python 2.5 and Python 3 (0o works on 2.6+)
 611     inline_format = "[[inpt PHON]]%s[[inpt TEXT]]",
 612     word_separator=" ",phoneme_separator="",
 613     safe_to_drop_characters=True, # TODO: really?
 614   ),
 615
 616   "mac-uk" : makeDic(
 617     "Scansoft/Nuance British voices in Mac OS 10.7+ (system lexicon editing required, see --mac-uk option)",
 618     ('.',syllable_separator),
 619     ("'",primary_stress),
 620     (secondary_stress,'',False),
 621     ('A',a_as_in_ah),
 622     ('@',a_as_in_apple),
 623     ('$',u_as_in_but),
 624     (a_as_in_ago,'$',False),
 625     ('A+',o_as_in_orange),
 626     ('a&U',o_as_in_now),
 627     ('E0',e_as_in_herd),
 628     ('a&I',eye),
 629     ('b',b),
 630     ('t&S',ch),
 631     ('d',d),
 632     ('D',th_as_in_them),
 633     ('E',e_as_in_them),
 634     ('0',ar_as_in_year),
 635     ('E&$',a_as_in_air),
 636     ('e&I',a_as_in_ate),
 637     ('f',f),
 638     ('g',g),
 639     ('h',h),
 640     ('I',i_as_in_it),
 641     ('I&$',ear),
 642     ('i',e_as_in_eat),
 643     ('d&Z',j_as_in_jump),
 644     ('k',k),
 645     ('l',l),
 646     ('m',m),
 647     ('n',n),
 648     ('nK',ng),
 649     ('o&U',o_as_in_go),
 650     ('O&I',oy_as_in_toy),
 651     ('p',p),
 652     ('R+',r),
 653     ('s',s),
 654     ('S',sh),
 655     ('t',t),
 656     ('T',th_as_in_think),
 657     ('O',oor_as_in_poor),
 658     ('U',opt_u_as_in_pull),
 659     ('u',oo_as_in_food),
 660     (close_to_or,'O',False),
 661     ('v',v),
 662     ('w',w),
 663     ('j',y),
 664     ('z',z),
 665     ('Z',ge_of_blige_etc),
 666     # lex_filename not set (mac-uk code does not permanently save the lexicon; see --mac-uk option to read text)
 667     lex_read_function = lambda *args:[(w,p) for w,_,p in MacBritish_System_Lexicon(False,os.environ.get("MACUK_VOICE","Daniel")).usable_words()],
 668     inline_oneoff_header = "(mac-uk phonemes output is for information only; you'll need the --mac-uk or --trymac-uk options to use it)\n",
 669     word_separator=" ",phoneme_separator="",
 670     stress_comes_before_vowel=True,
 671     safe_to_drop_characters=True, # TODO: really?
 672     cleanup_regexps=[(r'o\&U\.Ol', r'o\&Ul')],
 673   ),
 674
 675   "x-sampa" : makeDic(
 676     "General X-SAMPA notation, contributed by Jan Weiss",
 677     ('.',syllable_separator),
 678     ('"',primary_stress),
 679     ('%',secondary_stress),
 680     ('A',a_as_in_ah),
 681     (':',ipa_colon),
 682     ('A:',var3_a_as_in_ah),
 683     ('Ar\\',var4_a_as_in_ah),
 684     ('a:',var5_a_as_in_ah),
 685     ('{',a_as_in_apple),
 686     ('V',u_as_in_but),
 687     ('Q',o_as_in_orange),
 688     (var1_o_as_in_orange,'A',False),
 689     ('O',var2_o_as_in_orange),
 690     ('aU',o_as_in_now),
 691     ('{O',var1_o_as_in_now),
 692     ('@',a_as_in_ago),
 693     ('3:',e_as_in_herd),
 694     ('aI',eye),
 695     ('Ae',var1_eye),
 696     ('b',b),
 697     ('tS',ch),
 698     ('d',d),
 699     ('D',th_as_in_them),
 700     ('E',e_as_in_them),
 701     ('e',var1_e_as_in_them),
 702     (ar_as_in_year,'3:',False),
 703     ('E@',a_as_in_air),
 704     ('Er\\',var1_a_as_in_air),
 705     ('e:',var2_a_as_in_air),
 706     ('E:',var3_a_as_in_air),
 707     ('e@',var4_a_as_in_air),
 708     ('eI',a_as_in_ate),
 709     ('{I',var1_a_as_in_ate),
 710     ('f',f),
 711     ('g',g),
 712     ('h',h),
 713     ('I',i_as_in_it),
 714     ('1',var1_i_as_in_it),
 715     ('I@',ear),
 716     ('Ir\\',var1_ear),
 717     ('i',e_as_in_eat),
 718     ('i:',var1_e_as_in_eat),
 719     ('dZ',j_as_in_jump),
 720     ('k',k),
 721     ('x',opt_scottish_loch),
 722     ('l',l),
 723     ('m',m),
 724     ('n',n),
 725     ('N',ng),
 726     ('@U',o_as_in_go),
 727     ('oU',var2_o_as_in_go),
 728     ('@}',var1_u_as_in_but),
 729     ('OI',oy_as_in_toy),
 730     ('oI',var1_oy_as_in_toy),
 731     ('p',p),
 732     ('r\\',r),
 733     (var1_r,'r',False),
 734     ('s',s),
 735     ('S',sh),
 736     ('t',t),
 737     ('T',th_as_in_think),
 738     ('U@',oor_as_in_poor),
 739     ('Ur\\',var1_oor_as_in_poor),
 740     ('U',opt_u_as_in_pull),
 741     ('}:',oo_as_in_food),
 742     ('u:',var1_oo_as_in_food),
 743     (var2_oo_as_in_food,'u:',False),
 744     ('O:',close_to_or),
 745     (var1_close_to_or,'O',False),
 746     ('o:',var2_close_to_or),
 747     ('v',v),
 748     ('w',w),
 749     ('W',var1_w),
 750     ('j',y),
 751     ('z',z),
 752     ('Z',ge_of_blige_etc),
 753     lex_filename="acapela.txt",
 754     lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: may be able to convert part-of-speech (NOUN etc) to/from some other formats e.g. Festival
 755     lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]],
 756     # TODO: inline_format ?
 757     word_separator=" ",phoneme_separator="",
 758     safe_to_drop_characters=True, # TODO: really?
 759   ),
 760   "vocaloid" : makeVariantDic(
 761      "X-SAMPA phonemes for Yamaha's Vocaloid singing synthesizer.  Contributed by Lorenzo Gatti, who tested in Vocaloid 4 using two American English voices.",
 762      ('-',syllable_separator),
 763      (primary_stress,'',False), # not used by Vocaloid
 764      (secondary_stress,'',False),
 765      ('Q',a_as_in_ah),
 766      (var3_a_as_in_ah,'Q',False),
 767      (var4_a_as_in_ah,'Q',False),
 768      (var5_a_as_in_ah,'Q',False),
 769      ('O@',o_as_in_orange),
 770      (var1_o_as_in_orange,'O@',False),
 771      (var2_o_as_in_orange, 'O@',False),
 772      ('@U',o_as_in_now),
 773      ('@r',e_as_in_herd),
 774      (var1_eye, 'aI',False),
 775      ('e',e_as_in_them),
 776      ('I@',ar_as_in_year),
 777      ('e@',a_as_in_air),
 778      (var1_a_as_in_air, 'e@',False),
 779      (var2_a_as_in_air, 'e@',False),
 780      (var3_a_as_in_air, 'e@',False),
 781      (var4_a_as_in_air, 'e@',False),
 782      (var1_a_as_in_ate, 'eI', False),
 783      (var1_i_as_in_it, 'I',False),
 784      (var1_ear, 'I@',False),
 785      ('i:',e_as_in_eat),
 786      (var1_e_as_in_eat, 'i:',False),
 787      (var2_o_as_in_go, '@U', False),
 788      ('V', var1_u_as_in_but),
 789      (var1_oy_as_in_toy, 'OI',False),
 790      ('r',r),
 791      ('th',t),
 792      (var1_oor_as_in_poor, '@U',False),
 793      ('u:',oo_as_in_food),
 794      (var1_oo_as_in_food, 'u:',False),
 795      (var1_close_to_or,'O:',False),
 796      (var2_close_to_or,'O:',False),
 797      (var1_w, 'w', False),
 798      lex_filename="vocaloid.txt",
 799      phoneme_separator=" ",
 800      noInherit=True
 801   ),
 802   "android-pico" : makeVariantDic( # Overrides for the Pico voice.  NOTE(review): makeVariantDic presumably layers these on an earlier format's table (the description suggests X-SAMPA) - confirm which base applies
 803     'X-SAMPA phonemes for the default \"Pico\" voice in Android (1.6+, American), wrapped in Java code', # you could put en-GB instead of en-US, but it must be installed on the phone
 804     ('A:',a_as_in_ah), # won't sound without the :
 805     (var5_a_as_in_ah,'A:',False), # a: won't sound
 806     ('@U:',o_as_in_go),
 807     ('I',var1_i_as_in_it), # '1' won't sound
 808     ('i:',e_as_in_eat), # 'i' won't sound
 809     ('u:',oo_as_in_food), # }: won't sound
 810     ('a_I',eye),('a_U',o_as_in_now),('e_I',a_as_in_ate),('O_I',oy_as_in_toy),(var1_oy_as_in_toy,'O_I',False),('o_U',var2_o_as_in_go), # the diphthongs on this line are written with an underscore between the two vowel symbols
 811     cleanup_regexps=[(r'\\',r'\\\\'),('"','&quot;'),('::',':')], # (pattern, replacement) pairs, presumably applied with re.sub semantics: double each backslash, replace " with &quot;, collapse :: into :
 812     lex_filename="",lex_entry_format="", # lexicon settings are blanked out for this format (as is lex_read_function on the next line)
 813     lex_read_function=None,
 814     inline_oneoff_header=r'class Speak { public static void speak(android.app.Activity a,String s) { class OnInit implements android.speech.tts.TextToSpeech.OnInitListener { public OnInit(String s) { this.s = s; } public void onInit(int i) { mTts.speak(this.s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private String s; }; if(mTts==null) mTts=new android.speech.tts.TextToSpeech(a,new OnInit(s),"com.svox.pico"); else mTts.speak(s, android.speech.tts.TextToSpeech.QUEUE_ADD, null); } private static android.speech.tts.TextToSpeech mTts = null; };'+'\n',
 815     inline_header=r'Speak.speak(this,"<speak xml:lang=\"en-US\">', # opens the Java call and the SSML <speak> element that inline_footer closes
 816     inline_format=r'<phoneme alphabet=\"xsampa\" ph=\"%s\"/>', # %s is presumably replaced by the converted phonemes - confirm against the code that uses inline_format
 817     clause_separator=r".\n", # note r"\n" != "\n"
 818     inline_footer='</speak>");',
 819   ),
 820
 821   "acapela-uk" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and (phoneme,'symbol',False) tuples appear to be one-way extras - confirm against makeDic
 822     'Acapela-optimised X-SAMPA for UK English voices (e.g. "Peter"), contributed by Jan Weiss',
 823     ('.',syllable_separator),('"',primary_stress),('%',secondary_stress), # copied from "x-sampa", not tested
 824     ('A:',a_as_in_ah),
 825     ('{',a_as_in_apple),
 826     ('V',u_as_in_but),
 827     ('Q',o_as_in_orange),
 828     ('A',var1_o_as_in_orange),
 829     ('O',var2_o_as_in_orange),
 830     ('aU',o_as_in_now),
 831     ('{O',var1_o_as_in_now),
 832     ('@',a_as_in_ago),
 833     ('3:',e_as_in_herd),
 834     ('aI',eye),
 835     ('A e',var1_eye),
 836     ('b',b),
 837     ('t S',ch),
 838     ('d',d),
 839     ('D',th_as_in_them),
 840     ('e',e_as_in_them),
 841     (ar_as_in_year,'3:',False), # written with the same symbol as e_as_in_herd
 842     ('e @',a_as_in_air),
 843     ('e r',var1_a_as_in_air),
 844     ('e :',var2_a_as_in_air),
 845     (var3_a_as_in_air,'e :',False), # written with the same symbol as var2_a_as_in_air
 846     ('eI',a_as_in_ate),
 847     ('{I',var1_a_as_in_ate),
 848     ('f',f),
 849     ('g',g),
 850     ('h',h),
 851     ('I',i_as_in_it),
 852     ('1',var1_i_as_in_it),
 853     ('I@',ear),
 854     ('I r',var1_ear),
 855     ('i',e_as_in_eat),
 856     ('i:',var1_e_as_in_eat),
 857     ('dZ',j_as_in_jump),
 858     ('k',k),
 859     ('x',opt_scottish_loch),
 860     ('l',l),
 861     ('m',m),
 862     ('n',n),
 863     ('N',ng),
 864     ('@U',o_as_in_go),
 865     ('o U',var2_o_as_in_go),
 866     ('@ }',var1_u_as_in_but),
 867     ('OI',oy_as_in_toy),
 868     ('o I',var1_oy_as_in_toy),
 869     ('p',p),
 870     ('r',r),
 871     ('s',s),
 872     ('S',sh),
 873     ('t',t),
 874     ('T',th_as_in_think),
 875     ('U@',oor_as_in_poor),
 876     ('U r',var1_oor_as_in_poor),
 877     ('U',opt_u_as_in_pull),
 878     ('u:',oo_as_in_food),
 879     ('O:',close_to_or),
 880     (var1_close_to_or,'O',False), # written with the same symbol as var2_o_as_in_orange
 881     ('v',v),
 882     ('w',w),
 883     ('j',y),
 884     ('z',z),
 885     ('Z',ge_of_blige_etc),
 886     lex_filename="acapela.txt",
 887     lex_entry_format = "%s\t#%s\tUNKNOWN\n", # TODO: part-of-speech (as above)
 888     lex_read_function=lambda lexfile:[(word,pronunc.lstrip("#")) for word, pronunc, ignore in [l.split(None,2) for l in lexfile.readlines()]], # reads back lines shaped like lex_entry_format: word, then pronunciation with its leading # stripped, third field discarded.  NOTE(review): split(None,2) splits on any whitespace, so a pronunciation containing a space would be cut short and a line with fewer than 3 fields would fail to unpack - confirm neither can occur
 889     inline_format = "\\Prn=%s\\",
 890     safe_to_drop_characters=True, # TODO: really?
 891   ),
 892
 893   "cmu" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and (phoneme,'symbol',False) tuples appear to be one-way extras - confirm against makeDic
 894     'format of the US-English Carnegie Mellon University Pronouncing Dictionary, contributed by Jan Weiss', # http://www.speech.cs.cmu.edu/cgi-bin/cmudict
 895     ('0',syllable_separator),
 896     ('1',primary_stress),
 897     ('2',secondary_stress),
 898     ('AA',a_as_in_ah),
 899     (var1_a_as_in_ah,'2',False), # NOTE(review): this entry and the next write the stress digits '2' and '1' rather than vowel symbols - confirm that is intended
 900     (ipa_colon,'1',False),
 901     ('AE',a_as_in_apple),
 902     ('AH',u_as_in_but),
 903     (o_as_in_orange,'AA',False), # written with the same symbol as a_as_in_ah
 904     ('AW',o_as_in_now),
 905     (a_as_in_ago,'AH',False), # seems they don't use AX as festival-cmu does
 906     ('ER',e_as_in_herd), # TODO: check this one
 907     ('AY',eye),
 908     ('B',b),
 909     ('CH',ch),
 910     ('D',d),
 911     ('DH',th_as_in_them),
 912     ('EH',e_as_in_them),
 913     (ar_as_in_year,'ER',False), # this and the next entry are written with the same symbol as e_as_in_herd
 914     (a_as_in_air,'ER',False),
 915     ('EY',a_as_in_ate),
 916     ('F',f),
 917     ('G',g),
 918     ('HH',h),
 919     ('IH',i_as_in_it),
 920     ('EY AH',ear), # two symbols with a space between them stand for this one phoneme
 921     ('IY',e_as_in_eat),
 922     ('JH',j_as_in_jump),
 923     ('K',k),
 924     ('L',l),
 925     ('M',m),
 926     ('N',n),
 927     ('NG',ng),
 928     ('OW',o_as_in_go),
 929     ('OY',oy_as_in_toy),
 930     ('P',p),
 931     ('R',r),
 932     ('S',s),
 933     ('SH',sh),
 934     ('T',t),
 935     ('TH',th_as_in_think),
 936     ('UH',oor_as_in_poor),
 937     ('UW',oo_as_in_food),
 938     ('AO',close_to_or),
 939     ('V',v),
 940     ('W',w),
 941     ('Y',y),
 942     ('Z',z),
 943     ('ZH',ge_of_blige_etc),
 944     # lex_filename not set (does CMU have a lex file?)
 945     safe_to_drop_characters=True, # TODO: really?
 946   ),
 947
 948   # BEGIN PRE-32bit ERA SYNTHS (TODO: add an attribute to JS-hide them by default in HTML?  what about the SpeakJet which probably isn't a 32-bit chip but is post 32-bit era?  and then what about the 'approximation' formats - kana etc - would they need hiding by default also?  maybe best to just leave it)
 949   "apollo" : makeDic( # every symbol in this table starts with an underscore.  NOTE(review): tuples ending in False appear to be one-way extras - confirm against makeDic
 950     'Dolphin Apollo 2 serial-port and parallel-port hardware synthesizers (in case anybody still uses those)',
 951     (syllable_separator,'',False), # I don't think the Apollo had anything to mark stress; TODO: control the pitch instead like bbcmicro ?
 952     ('_QQ',syllable_separator,False), # a slight pause
 953     ('_AA',a_as_in_apple),
 954     ('_AI',a_as_in_ate),
 955     ('_AR',a_as_in_ah),
 956     ('_AW',close_to_or),
 957     ('_A',a_as_in_ago),
 958     ('_B',b),
 959     ('_CH',ch),
 960     ('_D',d),
 961     ('_DH',th_as_in_them),
 962     ('_EE',e_as_in_eat),
 963     ('_EI',a_as_in_air),
 964     ('_ER',e_as_in_herd),
 965     ('_E',e_as_in_them),
 966     ('_F',f),
 967     ('_G',g),
 968     ('_H',h),
 969     ('_IA',ear),
 970     ('_IE',eye),
 971     ('_I',i_as_in_it),
 972     ('_J',j_as_in_jump),
 973     ('_K',k),
 974     ('_KK',k,False), # sCHool
 975     ('_L',l),
 976     ('_M',m),
 977     ('_NG',ng),
 978     ('_N',n),
 979     ('_OA',o_as_in_go),
 980     ('_OO',opt_u_as_in_pull),
 981     ('_OR',var3_close_to_or),
 982     ('_OW',o_as_in_now),
 983     ('_OY',oy_as_in_toy),
 984     ('_O',o_as_in_orange),
 985     ('_P',p),
 986     ('_PP',p,False), # sPeech (a stronger P ?)
 987     # _Q = k w - done by cleanup_regexps below
 988     ('_R',r),
 989     ('_SH',sh),
 990     ('_S',s),
 991     ('_TH',th_as_in_think),
 992     ('_T',t), ('_TT',t,False),
 993     ('_UU',oo_as_in_food),
 994     ('_U',u_as_in_but),
 995     ('_V',v),
 996     ('_W',w),
 997     # _X = k s - done by cleanup_regexps below
 998     ('_Y',y),
 999     ('_ZH',ge_of_blige_etc),
1000     ('_Z',z),
1001     # lex_filename not set (the hardware doesn't have one; HAL has an "exceptions dictionary" but I don't know much about it)
1002     approximate_missing=True,
1003     safe_to_drop_characters=True, # TODO: really?
1004     word_separator=" ",phoneme_separator="",
1005     cleanup_regexps=[('_K_W','_Q'),('_K_S','_X')], # (pattern, replacement) pairs: the k+w and k+s sequences become the single codes _Q and _X
1006     cvtOut_regexps=[('_Q','_K_W'),('_X','_K_S')], # the same two pairs reversed, expanding _Q and _X back into their two-symbol sequences
1007   ),
1008   "dectalk" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and tuples ending in False appear to be one-way extras - confirm against makeDic
1009     'DECtalk hardware synthesizers (American English)', # (1984-ish serial port; later ISA cards)
1010     (syllable_separator,'',False),
1011     ("'",primary_stress),
1012     ('aa',o_as_in_orange),
1013     ('ae',a_as_in_apple),
1014     ('ah',u_as_in_but),
1015     ('ao',close_to_or), # bought
1016     ('aw',o_as_in_now),
1017     ('ax',a_as_in_ago),
1018     ('ay',eye),
1019     ('b',b),
1020     ('ch',ch),
1021     ('d',d), ('dx',d,False),
1022     ('dh',th_as_in_them),
1023     ('eh',e_as_in_them),
1024     ('el',l,False), # -le of bottle, allophone ?
1025     # TODO: en: -on of button (2 phonemes?)
1026     ('ey',a_as_in_ate),
1027     ('f',f),
1028     ('g',g),
1029     ('hx',h),
1030     ('ih',i_as_in_it), ('ix',i_as_in_it,False),
1031     ('iy',e_as_in_eat), ('q',e_as_in_eat,False),
1032     ('jh',j_as_in_jump),
1033     ('k',k),
1034     ('l',l), ('lx',l,False),
1035     ('m',m),
1036     ('n',n),
1037     ('nx',ng),
1038     ('ow',o_as_in_go),
1039     ('oy',oy_as_in_toy),
1040     ('p',p),
1041     ('r',r), ('rx',r,False),
1042     ('rr',e_as_in_herd),
1043     ('s',s),
1044     ('sh',sh),
1045     ('t',t), ('tx',t,False),
1046     ('th',th_as_in_think),
1047     ('uh',opt_u_as_in_pull),
1048     ('uw',oo_as_in_food),
1049     ('v',v),
1050     ('w',w),
1051     ('yx',y),
1052     ('z',z),
1053     ('zh',ge_of_blige_etc),
1054     ('ihr',ear), # DECtalk makes this from ih + r
1055     approximate_missing=True,
1056     cleanup_regexps=[('yxuw','yu')], # TODO: other allophones ("x',False" stuff above)?
1057     cvtOut_regexps=[('yu','yxuw')], # the cleanup pair above, reversed
1058     # lex_filename not set (depends on which model etc)
1059     stress_comes_before_vowel=True,
1060     safe_to_drop_characters=True, # TODO: really?
1061     word_separator=" ",phoneme_separator="",
1062     inline_header="[:phoneme on]\n",
1063     inline_format="[%s]", # %s is presumably replaced by the converted phonemes, which then sit inside square brackets - confirm against the code that uses inline_format
1064   ),
1065   "doubletalk" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and tuples ending in False appear to be one-way extras - confirm against makeDic
1066     'DoubleTalk PC/LT serial-port hardware synthesizers (American English; assumes DOS driver by default, otherwise set DTALK_COMMAND_CODE to your current command-code binary value, e.g. export DTALK_COMMAND_CODE=1)', # (1 is the synth's default; the DOS driver lets you put * instead)
1067     (syllable_separator,'',False),
1068     ("/",primary_stress), # TODO: check it doesn't need a balancing \ afterwards (docs do say it's a "temporary" change of pitch, but it's unclear how long a 'temporary')
1069     ('M',m),('N',n),('NX',ng),('O',o_as_in_go),
1070     ('OW',o_as_in_go,False), # allophone
1071     (o_as_in_orange,'O',False), # TODO: is this the best approximation we can do?
1072     ('OY',oy_as_in_toy),('P',p),
1073     ('R',r),('S',s),('SH',sh),('T',t),
1074     ('TH',th_as_in_think),('V',v),('W',w),('Z',z),
1075     ('ZH',ge_of_blige_etc),('K',k),('L',l),
1076     ('PX',p,False), ('TX',t,False), # aspirated allophones
1077     ('WH',w,False), ('KX',k,False), # ditto
1078     ('YY',y),('Y',y,False),
1079     ('UH',opt_u_as_in_pull),('UW',oo_as_in_food),
1080     ('AA',a_as_in_ah),('AE',a_as_in_apple),
1081     ('AH',u_as_in_but),('AO',close_to_or),
1082     ('AW',o_as_in_now),('AX',a_as_in_ago),
1083     ('AY',eye),('B',b),('CH',ch),('D',d),
1084     ('DH',th_as_in_them),
1085     ('DX',t,False), # an American "d"-like "t"
1086     ('EH',e_as_in_them),('ER',e_as_in_herd),
1087     ('EY',a_as_in_ate),('F',f),('G',g),('H',h),
1088     ('IH',i_as_in_it),('IX',i_as_in_it,False),
1089     ('IY',e_as_in_eat),('JH',j_as_in_jump),
1090     approximate_missing=True,
1091     stress_comes_before_vowel=True,
1092     inline_format=markup_doubleTalk_word, # a name defined elsewhere in this file, not a literal format string as in most other entries
1093     format_is_binary=ifset('DTALK_COMMAND_CODE',True), # presumably true only when DTALK_COMMAND_CODE is set in the environment (see the description) - confirm against ifset
1094     # DoubleTalk does have a loadable "exceptions dictionary" but usually relies on a DOS tool to write it; I don't have the documentation about it (and don't know how much RAM is available for it - it's taken from the input buffer)
1095   ),
1096   "keynote" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and tuples ending in False appear to be one-way extras - confirm against makeDic
1097     'Phoneme-read and lexicon-add codes for Keynote Gold hardware synthesizers (American English)', # ISA, PCMCIA, serial, etc; non-serial models give you an INT 2Fh param to get the address of an API function to call; not sure which software can send these codes directly to it)
1098     (syllable_separator,'',False),
1099     (primary_stress,"'"),(secondary_stress,'"'), # these two tuples put the phoneme first and the symbol second, unlike the rest of this table
1100     ('w',w),('y',y),('h',h),('m',m),('n',n),('ng',ng),
1101     ('l',l),('r',r),('f',f),('v',v),('s',s),('z',z),
1102     ('th',th_as_in_think),('dh',th_as_in_them),('k',k),
1103     ('ch',ch),('zh',ge_of_blige_etc),('sh',sh),('g',g),
1104     ('jh',j_as_in_jump),('b',b),('p',p),('d',d),('t',t),
1105     ('i',e_as_in_eat),('I',i_as_in_it), # symbols are case-sensitive in this table: i/I, e/E, u/U and o/O are different phonemes
1106     ('e',a_as_in_ate),('E',e_as_in_them),
1107     ('ae',a_as_in_apple),('u',oo_as_in_food),
1108     ('U',opt_u_as_in_pull),('o',o_as_in_go),
1109     ('O',close_to_or),('a',o_as_in_orange),
1110     ('^',u_as_in_but),('R',e_as_in_herd),
1111     ('ay',eye),('Oy',oy_as_in_toy),('aw',o_as_in_now),
1112     ('=',a_as_in_ago),
1113     approximate_missing=True,
1114     inline_format="[p]%s[t]", # %s is presumably replaced by the converted phonemes, between the [p] and [t] codes - confirm against the code that uses inline_format
1115     lex_filename="keynote.dat", # you have to somehow get this directly dumped to the card, see comment above
1116     lex_entry_format="[x]%s %s", lex_footer="[t]\n",
1117     stress_comes_before_vowel=False, # even though it's "'"
1118   ),
1119   "audapter" : makeVariantDic( # lists no phoneme tuples of its own: per the description comment on the next line, the phonemes are the same as the Keynote entry and only the settings below differ
1120   "Audapter Speech System, an old hardware serial/parallel-port synthesizer (American English)", # 1989 I think.  The phonemes themselves are the same as the Keynote above, but there's an extra binary byte in the commands and the lex format is stricter.  I haven't checked but my guess is Audapter came before Keynote.
1121   inline_format='\x05[p] %s\x05[t]', # \x05 precedes each bracketed command here and in lex_entry_format (the "extra binary byte" mentioned in the description comment)
1122   format_is_binary=True,
1123   lex_filename="audapter.dat",
1124   lex_entry_format="\x05[x]%s %s\x05[t]\n", lex_footer="", # each entry carries its own [t], and the footer is empty
1125   ),
1126   "bbcmicro" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and tuples ending in False appear to be one-way extras - confirm against makeDic
1127     "BBC Micro Speech program from 1985 (see comments in lexconvert.py for more details)",
1128     # Speech was written by David J. Hoskins and published by Superior Software.  It took 7.5k of RAM including 3.1k of samples (49 phonemes + 1 for fricatives at 64 bytes each, 4-bit ~5.5kHz), 2.2k of lexicon, and 2.2k of machine code; sounds "retro" by modern standards but quite impressive for the BBC Micro in 1985.  Samples are played by amplitude-modulating the BBC's tone generator.
1129     # If you use an emulator like BeebEm, you'll need diskimg/Speech.ssd.  This can be made from your original Speech disc, or you might be able to find one but beware of copyright!  Same goes with the ROM images included in BeebEm (you might want to delete ones you didn't have).  There has been considerable discussion over whether UK copyright law does or should allow "format-shifting" your own legally-purchased media, and I don't fully understand all the discussion so I don't want to give advice on it here.  The issue is "format-shifting" your legally-purchased BBC Micro ROM code and Speech disc to emulator images; IF this is all right then I suspect downloading someone else's copy is arguably allowed as long as you bought it legally "back in the day", but I'm not a solicitor so I don't know.
1130     # (Incidentally, yes I was the Silas Brown referred to in Beebug 11.1 p.59, 11.9 p.50/11.10 p.47 and 12.10 p.24, and, no, the question in the final issue wasn't quite how I put it, but all taken in good humour.)
1131     # lexconvert's --phones bbcmicro option creates *SPEAK commands which you can type into the BBC Micro or paste into an emulator, either at the BASIC prompt, or in a listing with line numbers provided by AUTO.  You have to load the Speech program first of course.
1132     # To script this on BeebEm, first turn off the Speech disc's boot option (by turning off File / Disc options / Write protect and entering "*OPT 4,0"; use "*OPT 4,3" if you want it back later; if you prefer to edit the disk image outside of the emulator then change byte 0x106 from 0x33 to 0x03), and then you can do (e.g. on a Mac) open /usr/local/BeebEm3/diskimg/Speech.ssd && sleep 1 && (echo '*SPEECH';python lexconvert.py --phones bbcmicro "Greetings from 19 85") | pbcopy && osascript -e 'tell application "System Events" to keystroke "v" using command down'
1133     # or if you know it's already loaded: echo "Here is some text" | python lexconvert.py --phones bbcmicro | pbcopy && osascript -e 'tell application "BeebEm3" to activate' && osascript -e 'tell application "System Events" to keystroke "v" using command down'
1134     # (unfortunately there doesn't seem to be a way of doing it without giving the emulator window focus)
1135     # If you want to emulate a Master, you might need a *DISK before the *SPEECH (to take it out of ADFS mode).
1136     # You can also put Speech into ROM, but this can cause problems: see comments on SP8000 later.
1137     (syllable_separator,'',False),
1138     ('4',primary_stress),
1139     ('5',secondary_stress), # (these are pitch numbers on the BBC; normal pitch is 6, and lower numbers are higher pitches, so try 5=secondary and 4=primary; 3 sounds less calm)
1140     ('AA',a_as_in_ah),
1141     ('AE',a_as_in_apple),
1142     ('AH',u_as_in_but),
1143     ('O',o_as_in_orange),
1144     ('AW',o_as_in_now),
1145     (a_as_in_ago,'AH',False), # written with the same symbol as u_as_in_but
1146     ('ER',e_as_in_herd),
1147     ('IY',eye),
1148     ('B',b),
1149     ('CH',ch),
1150     ('D',d),
1151     ('DH',th_as_in_them),
1152     ('EH',e_as_in_them),
1153     (ar_as_in_year,'ER',False), # written with the same symbol as e_as_in_herd
1154     ('AI',a_as_in_air),
1155     ('AY',a_as_in_ate),
1156     ('F',f),
1157     ('G',g),
1158     ('/H',h),
1159     ('IH',i_as_in_it),
1160     ('IX',var2_i_as_in_it), # (IX sounds to me like a slightly shorter version of IH)
1161     ('IXAH',ear),
1162     ('EER',var2_ear), # e.g. 'hear', 'near' - near enough
1163     ('EE',e_as_in_eat),
1164     ('J',j_as_in_jump),
1165     ('K',k),
1166     ('C',k,False), # for CT as in "fact", read out as K+T
1167     ('L',l),
1168     ('M',m),
1169     ('N',n),
1170     ('NX',ng),
1171     ('OW',o_as_in_go),
1172     ('OL',opt_ol_as_in_gold), # (if dest format doesn't have this, it'll get o_as_in_orange from the O, then the l)
1173     ('OY',oy_as_in_toy),
1174     ('P',p),
1175     ('R',r),
1176     ('S',s),
1177     ('SH',sh),
1178     ('T',t),
1179     ('TH',th_as_in_think),
1180     ('AOR',oor_as_in_poor),
1181     ('UH',oor_as_in_poor,False), # TODO: really? (espeak 'U' goes to opt_u_as_in_pull, and eSpeak also used U for the o in good, which sounds best with Speech's default UH4, hence the line below, but where did we get UH->oor_as_in_poor from?  Low-priority though because how often do you convert OUT of bbcmicro format)
1182     (opt_u_as_in_pull,'UH',False),
1183     ('/U',opt_u_as_in_pull,False),
1184     ('/UL',opt_ul_as_in_pull), # if dest format doesn't have this, it'll get opt_u_as_in_pull from the /U, then l
1185     ('UW',oo_as_in_food),
1186     ('UX',oo_as_in_food,False),
1187     ('AO',close_to_or),
1188     ('V',v),
1189     ('W',w),
1190     ('Y',y),
1191     ('Z',z),
1192     ('ZH',ge_of_blige_etc),
1193     lex_filename=ifset("MAKE_SPEECH_ROM","SPEECH.ROM","BBCLEX"), # presumably SPEECH.ROM when MAKE_SPEECH_ROM is set in the environment and BBCLEX otherwise - confirm against ifset
1194     lex_entry_format=as_utf8("> %s_")+chr(128)+as_utf8("%s"), # (specifying 'whole word' for now; remove the space before or the _ after if you want)
1195     lex_read_function = lambda lexfile: [(w[0].lstrip().rstrip('_').lower(),w[1]) for w in filter(lambda x:len(x)==2,[w.split(chr(128)) for w in getBuf(lexfile).read().split('>')])], # TODO: this reads back the entries we generate, but is unlikely to work well with the wildcards in the default lexicon that would have been added if SPEECH_DISK was set (c.f. trying to read eSpeak's en_rules instead of en_extra)
1196     lex_word_case = "upper",
1197     lex_header = bbc_prepDefaultLex, # this and the two settings below are names defined elsewhere in this file, not literal strings
1198     lex_footer = bbc_appendDefaultLex, # + ">**"
1199     inline_format = markup_bbcMicro_word,
1200     word_separator=" ",phoneme_separator="",
1201     clause_separator=write_bbcmicro_phones, # special case
1202     safe_to_drop_characters=True, # TODO: really?
1203     cleanup_regexps=[
1204       ('KT','CT'), # Speech instructions: "CT as in fact"
1205       ('DYUW','DUX'), # "DUX as in duke"
1206       ('AHR$','AH'), # usually sounds a bit better
1207     ],
1208     cvtOut_regexps=[('DUX','DYUW')], # CT handled above
1209   ),
1210   "bbcmicro-cc" : makeDic( # symbols are case-sensitive in this table (e.g. E/e, OO/oo, EA/ea).  NOTE(review): tuples ending in False appear to be one-way extras - confirm against makeDic
1211      "Computer Concepts Speech ROM which provided phonemes for the BBC Micro's TMS5220 \"speech chip\" add-on (less widely sold than the software-only product)", # (and harder to run on an emulator.  It wasn't the only phoneme ROM, e.g. Easytalk Speech Utility ROM by Galaxy, reviewed in Beebug Jan/Feb 1985 (3.8) p.32, expanded on Acorn's original PHROM with commands like *SAY Y.U:N.I.V.ER.S but we don't know all the phonemes; there were also some allophone-based hardware boards)
1212      (syllable_separator,"",False),
1213      ('*',primary_stress),('+',secondary_stress),
1214      ('E',e_as_in_eat),('i',i_as_in_it),('e',e_as_in_them),
1215      ('a',a_as_in_apple),('u',u_as_in_but),('AR',a_as_in_ah),
1216      ('o',o_as_in_orange),('OR',close_to_or),('oo',opt_u_as_in_pull),
1217      ('OO',oo_as_in_food),('ER',e_as_in_herd),('A',a_as_in_ate),
1218      ('I',eye),('O',o_as_in_go),('OY',oy_as_in_toy),
1219      ('AW',o_as_in_now),('EA',ear),('ea',a_as_in_air),
1220      ('UR',oor_as_in_poor),('UH',a_as_in_ago),
1221      ('P',p),('B',b),('T',t),
1222      ('D',d),('K',k),('G',g),
1223      ('CH',ch),('J',j_as_in_jump),('F',f),
1224      ('V',v),('TH',th_as_in_think),('DH',th_as_in_them),
1225      ('S',s),('Z',z),('SH',sh),
1226      ('ZH',ge_of_blige_etc),('H',h),('M',m),
1227      ('N',n),('NG',ng),('L',l),
1228      ('R',r),('Y',y),('W',w),
1229      stress_comes_before_vowel=True,
1230      inline_header="*UTTER <1> ",
1231      clause_separator="\n*UTTER <1> ", # TODO: manual does not say what the maximum length is; longest parameter in examples is 80 bytes; should we use inline_format to make each WORD a separate command?
1232      cleanup_regexps=[('[*] ','*'),('[+] ','+')], # (pattern, replacement) pairs: drop the space that follows a * or + stress mark; the square brackets keep * and + literal in the pattern
1233      safe_to_drop_characters=' ', # a string here, where most other entries give True
1234   ),
1235
1236   "amiga" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and (phoneme,'symbol',False) tuples appear to be one-way extras - confirm against makeDic
1237     'AmigaOS speech synthesizer (American English)', # shipped with the 1985 Amiga release; developed by SoftVoice Inc
1238     # All I had to go by for this was a screenshot on Marcos Miranda's "blog".  I once saw this synth demonstrated but never tried it.  My early background was the BBC Micro, not Amigas etc.  But I know some people are keen on Amigas so I might as well include it.
1239     # (By the way I think David Hoskins had it harder than SoftVoice.  Yes they were both in 1985, but the Amiga was a new 16-bit machine while the BBC was an older 8-bit one.  See the "sam" format for an even older one though, although probably not written by one person.)
1240     (syllable_separator,'',False),
1241     ('4',primary_stress),('3',secondary_stress),
1242     ('/H',h),
1243     ('EH',e_as_in_them),
1244     ('L',l),
1245     ('OW',o_as_in_go),
1246     ('AY',eye),
1247     ('AE',a_as_in_apple),
1248     ('M',m),
1249     ('DH',th_as_in_them),
1250     ('IY',e_as_in_eat),
1251     ('AH',a_as_in_ago),
1252     ('G',g),
1253     ('K',k),
1254     ('U',u_as_in_but),
1255     ('P',p),
1256     ('Y',y),
1257     ('UW',oo_as_in_food),
1258     ('T',t),
1259     ('ER',var1_a_as_in_ago),
1260     ('IH',i_as_in_it),
1261     ('S',s),
1262     ('Z',z),
1263     ('AW',o_as_in_now),
1264     ('AA',a_as_in_ah),
1265     ('R',r),
1266     ('D',d),('F',f),('N',n),('NX',ng),('J',j_as_in_jump),
1267     ('B',b),('V',v),('TH',th_as_in_think),
1268     ('OH',close_to_or),('EY',a_as_in_ate),
1269     # The following consonants were not on the screenshot
1270     # (or at least I couldn't find them) so I'm guessing.
1271     # I think this should work given the way the other
1272     # consonants work in this table.
1273     ('W',w),('CH',ch),('SH',sh),
1274     # The following vowels were not in the screenshot and
1275     # we just have to hope this guess is right - when
1276     # someone tries it on an Amiga and says it doesn't
1277     # work, maybe we can update this....
1278     ('O',o_as_in_orange),('OY',oy_as_in_toy),
1279     # and these ones we can approximate to ones we already know (given that we're having to approximate British to an American voice anyway, it can't hurt TOO much more)
1280      (a_as_in_air,'EH',False), # written with the same symbol as e_as_in_them
1281      (e_as_in_herd,'ER',False), # this and the next entry are written with the same symbol as var1_a_as_in_ago
1282      (ar_as_in_year,'ER',False),
1283      (ear,'IYAH',False), # or try IYER, or there might be a phoneme for it
1284      (ge_of_blige_etc,'J',False), # written with the same symbol as j_as_in_jump
1285      (oor_as_in_poor,'OH',False), # written with the same symbol as close_to_or
1286     # lex_filename not set (I have no idea how the Amiga lexicon worked)
1287     safe_to_drop_characters=True, # TODO: really?
1288     word_separator=" ",phoneme_separator="",
1289   ),
1290   "sam" : makeDic( # NOTE(review): ('symbol',phoneme) tuples appear to give this format's spelling of a phoneme, and tuples ending in False appear to be one-way extras - confirm against makeDic
1291   'Software Automatic Mouth (1982 American English synth that ran on C64, Atari 400/800/etc and Apple II/etc)', # *might* be similar to Macintalk on the 1st Macintosh in 1984
1292   (syllable_separator,'',False),
1293   (primary_stress,'4'), # this and the next tuple put the phoneme first and the symbol second, unlike the rest of this table
1294   (secondary_stress,'5'),
1295   ('IY',e_as_in_eat),
1296   ('IH',i_as_in_it),
1297   ('EH',e_as_in_them),
1298   ('AE',a_as_in_apple),
1299   ('AA',o_as_in_orange),
1300   ('AH',u_as_in_but),
1301   ('AO',close_to_or),
1302   ('OH',o_as_in_go),
1303   ('UH',opt_u_as_in_pull),
1304   ('UX',oo_as_in_food),
1305   ('ER',e_as_in_herd),
1306   ('AX',a_as_in_apple,False), # allophone?  NOTE(review): the dectalk, doubletalk and speakjet tables in this file map ax/AX to a_as_in_ago, and this table has no entry for a_as_in_ago - confirm a_as_in_apple is intended here
1307   ('IX',i_as_in_it,False), # allophone?
1308   ('EY',a_as_in_ate),
1309   ('AY',eye),('OY',oy_as_in_toy),
1310   ('AW',o_as_in_now),('OW',o_as_in_go,False),
1311   ('UW',oo_as_in_food,False), # allophone?
1312   ('R',r),('L',l),('W',w),('WH',w,False),('Y',y),('M',m),
1313   ('N',n),('NX',ng),('B',b),('D',d),('G',g),('Z',z),
1314   ('J',j_as_in_jump),('ZH',ge_of_blige_etc),('V',v),
1315   ('DH',th_as_in_them),('S',s),('SH',sh),('F',f),
1316   ('TH',th_as_in_think),('P',p),('T',t),('K',k),
1317   ('CH',ch),('/H',h),('Q',glottal_stop),
1318   approximate_missing=True,
1319   word_separator=" ",phoneme_separator="",
1320   # TODO: inline_format etc similar to bbcmicro?
1321   # In Atari BASIC, you set SAM$ to the phonemes and then
1322   # do A=USR(8192).  I don't know about the C64 etc versions.
1323   # (max 255 phonemes per string; don't know max line len.)
1324   ),
1325
1326   "cheetah" : makeDic( # symbols in this table are decimal allophone numbers written as strings.  NOTE(review): tuples ending in False appear to be one-way extras - confirm against makeDic
1327      'Allophone codes for the 1983 "Cheetah Sweet Talker" SP0256-based hardware add-on for ZX Spectrum and BBC Micro home computers. The conversion from phonemes to allophones might need tweaking.',
1328      (syllable_separator,'',False),
1329      ("0",syllable_separator,False), # codes 0 to 4 are all tied to syllable_separator, each with the False flag
1330      ("1",syllable_separator,False),
1331      ("2",syllable_separator,False),
1332      ("3",syllable_separator,False),
1333      ("4",syllable_separator,False),
1334      ("5",oy_as_in_toy),
1335      ("6",eye),
1336      ("7",e_as_in_them),
1337      ("8",k,False),
1338      ("9",p),
1339      ("10",j_as_in_jump),
1340      ("11",n),
1341      ("12",i_as_in_it),
1342      ("13",t),
1343      ("14",r),
1344      ("15",u_as_in_but),
1345      ("16",m),
1346      ("17",t,False),
1347      ("18",th_as_in_them),
1348      ("19",e_as_in_eat),
1349      ("20",a_as_in_ate),
1350      ("21",d),
1351      ("22",oo_as_in_food),
1352      ("23",close_to_or),
1353      ("24",o_as_in_orange),
1354      ("25",y),
1355      ("26",a_as_in_apple),
1356      ("27",h),
1357      ("28",b),
1358      ("29",th_as_in_think),
1359      (opt_u_as_in_pull,"30",False), # code 30 is listed twice: here for opt_u_as_in_pull and on the next line for opt_ul_as_in_pull
1360      ("30",opt_ul_as_in_pull),
1361      ("31",oo_as_in_food,False),
1362      ("32",o_as_in_now),
1363      ("33",d,False),
1364      ("34",g,False),
1365      ("35",v),
1366      ("36",g),
1367      ("37",sh),
1368      ("38",ge_of_blige_etc),
1369      ("39",r,False),
1370      ("40",f),
1371      ("41",k),
1372      ("42",k,False),
1373      ("43",z),
1374      ("44",ng),
1375      ("45",l),
1376      ("46",w),
1377      ("47",a_as_in_air),
1378      ("49",y,False), # NOTE(review): the table goes from 47 to 49 with no entry for code 48 - confirm that is intentional
1379      ("50",ch),
1380      ("51",a_as_in_ago),
1381      ("52",e_as_in_herd),
1382      (var1_a_as_in_ago,"52",False), # written with the same code as e_as_in_herd
1383      ("53",o_as_in_go),
1384      ("54",th_as_in_them,False),
1385      ("55",s),
1386      ("56",n,False),
1387      ("57",h,False),
1388      ("58",var3_close_to_or),
1389      ("59",a_as_in_ah),
1390      ("60",ear), # or var2_ear
1391      ("61",g,False),
1392      ("62",l,False),
1393      ("63",b,False),
1394      approximate_missing=True,
1395      phoneme_separator=',',safe_to_drop_characters=",",
1396      inline_header="DATA ",inline_footer=",0"), # with the ',' phoneme_separator above, the header and footer frame the codes as a comma-separated DATA line ending in ,0
1397
1398   # END (?) PRE-32bit ERA SYNTHS (but see TODO above re SpeakJet, which is below)
1399
1400   "speakjet" : makeDic(
1401     'Allophone codes for the American English "SpeakJet" speech synthesis chip (the conversion from phonemes to allophones might need tweaking).  Set the SPEAKJET_SYM environment variable to use mnemonics, otherwise numbers are used (set SPEAKJET_BINARY for binary output).',
1402   # TODO: might want to do something similar for the older Votrax SC-02 chip, but would need to check how exactly its phoneme interface was exposed to software by the PC cards that used it (Heathkit HV-2000 etc; not sure if any are still in use though)
1403     (syllable_separator,'',False), # TODO: instead of having emphasis, the Speakjet has a 'faster' code for all NON-emphasized syllables
1404     (speakjet('IY',128),e_as_in_eat),
1405     (speakjet('IH',129),i_as_in_it),
1406     (speakjet('EY',130),a_as_in_ate),
1407     (speakjet('EH',131),e_as_in_them),
1408     (speakjet('AY',132),a_as_in_apple),
1409     (speakjet('AX',133),a_as_in_ago),
1410     (speakjet('UX',134),u_as_in_but),
1411     (speakjet('OH',135),o_as_in_orange),
1412     (speakjet('AW',136),a_as_in_ah),
1413     (speakjet('OW',137),o_as_in_go),
1414     (speakjet('UH',138),opt_u_as_in_pull),
1415     (speakjet('UW',139),oo_as_in_food),
1416     (speakjet('MM',140),m),
1417     (speakjet('NE',141),n,False),
1418     (speakjet('NO',142),n),
1419     (speakjet('NGE',143),ng,False),
1420     (speakjet('NGO',144),ng),
1421     (speakjet('LE',145),l,False),
1422     (speakjet('LO',146),l),
1423     (speakjet('WW',147),w),
1424     (speakjet('RR',148),r),
1425     (speakjet('IYRR',149),ear),
1426     (speakjet('EYRR',150),a_as_in_air),
1427     (speakjet('AXRR',151),e_as_in_herd),
1428     (speakjet('AWRR',152),a_as_in_ah,False),
1429     (speakjet('OWRR',153),close_to_or),
1430     (speakjet('EYIY',154),a_as_in_ate,False),
1431     (speakjet('OHIY',155),eye),
1432     (speakjet('OWIY',156),oy_as_in_toy),
1433     (speakjet('OHIH',157),eye,False),
1434     (speakjet('IYEH',158),y),
1435     (speakjet('EHLL',159),l,False),
1436     (speakjet('IYUW',160),oo_as_in_food,False),
1437     (speakjet('AXUW',161),o_as_in_now),
1438     (speakjet('IHUW',162),oo_as_in_food,False),
1439     # TODO: 163 AYWW = o_as_in_now a_as_in_ago ? handle in cleanup_regexps + cvtOut_regexps ?
1440     (speakjet('OWWW',164),o_as_in_go,False),
1441     (speakjet('JH',165),j_as_in_jump),
1442     (speakjet('VV',166),v),
1443     (speakjet('ZZ',167),z),
1444     (speakjet('ZH',168),ge_of_blige_etc),
1445     (speakjet('DH',169),th_as_in_them),
1446     # TODO: get cleanup_regexps to clean up some of these according to what's coming next etc:
1447     (speakjet('BE',170),b,False),
1448     (speakjet('BO',171),b),
1449     (speakjet('EB',172),b,False),
1450     (speakjet('OB',173),b,False),
1451     (speakjet('DE',174),d,False),
1452     (speakjet('DO',175),d),
1453     (speakjet('ED',176),d,False),
1454     (speakjet('OD',177),d,False),
1455     (speakjet('GE',178),g,False),
1456     (speakjet('GO',179),g),
1457     (speakjet('EG',180),g,False),
1458     (speakjet('OG',181),g,False),
1459     (speakjet('CH',182),ch),
1460     (speakjet('HE',183),h,False),
1461     (speakjet('HO',184),h),
1462     (speakjet('WH',185),w,False),
1463     (speakjet('FF',186),f),
1464     (speakjet('SE',187),s,False),
1465     (speakjet('SO',188),s),
1466     (speakjet('SH',189),sh),
1467     (speakjet('TH',190),th_as_in_think),
1468     (speakjet('TT',191),t),
1469     (speakjet('TU',192),t,False),
1470     # TODO: 193 TS in cleanup_regexps and cvtOut_regexps
1471     (speakjet('KE',194),k,False),
1472     (speakjet('KO',195),k),
1473     (speakjet('EK',196),k,False),
1474     (speakjet('OK',197),k,False),
1475     (speakjet('PE',198),p,False),
1476     (speakjet('PO',199),p),
1477     # lex_filename not set (I think the front-end software might have one, but don't know if it's accessible; chip itself just takes phonemes)
1478     approximate_missing=True,
1479     word_separator=ifset('SPEAKJET_BINARY',""," "),
1480     phoneme_separator=ifset('SPEAKJET_BINARY',""," "),
1481     clause_separator=ifset('SPEAKJET_BINARY',"","\n"), # TODO: is there a pause code?
1482     output_is_binary=ifset('SPEAKJET_BINARY',True),
1483     safe_to_drop_characters=True, # TODO: really?
1484   ),
1485
1486   "rsynth" : makeDic(
1487     'rsynth text-to-speech C library (American English)', # TODO: test
1488     (syllable_separator,'',False), # TODO: emphasis?
1489     ("i:",e_as_in_eat),
1490     ("I",i_as_in_it),
1491     ("eI",a_as_in_ate),
1492     ("E",e_as_in_them),
1493     ("{",a_as_in_apple),
1494     ("V",u_as_in_but),
1495     ("Q",o_as_in_orange),
1496     ("A:",a_as_in_ah),
1497     ("oU",o_as_in_go),
1498     ("U",opt_u_as_in_pull),
1499     ("u:",oo_as_in_food),
1500     ("m",m),
1501     ("n",n),
1502     ("N",ng),
1503     ("l",l),
1504     ("w",w),
1505     ("r",r),
1506     ("I@",ear),
1507     ("e@",a_as_in_air),
1508     ("3:",e_as_in_herd),
1509     ("Qr",close_to_or),
1510     ("OI",oy_as_in_toy),
1511     ("aI",eye),
1512     ("j",y),
1513     ("U@",oo_as_in_food,False),
1514     ("aU",o_as_in_now),
1515     ("@U",o_as_in_go,False),
1516     ("dZ",j_as_in_jump),
1517     ("v",v),
1518     ("z",z),
1519     ("Z",ge_of_blige_etc),
1520     ("D",th_as_in_them),
1521     ("b",b),
1522     ("d",d),
1523     ("g",g),
1524     ("tS",ch),
1525     ("h",h),
1526     ("f",f),
1527     ("s",s),
1528     ("S",sh),
1529     ("T",th_as_in_think),
1530     ("t",t),
1531     ("k",k),
1532     ("p",p),
1533     approximate_missing=True,
1534     # lex_filename not set (TODO: check what sort of lexicon is used by rsynth's "say" front-end)
1535     safe_to_drop_characters=True, # TODO: really?
1536     word_separator=" ",phoneme_separator="",
1537   ),
1538
1539   "unicode-ipa" : makeDic(
1540     "IPA symbols in Unicode, as used by an increasing number of dictionary programs, websites etc",
1541     ('.',syllable_separator,False),
1542     (syllable_separator,'',False),
1543     (u'\u02c8',primary_stress),
1544     (u'\u02cc',secondary_stress),
1545     # NB the above two are "modifier", not "combining",
1546     # Unicode characters.  There IS a difference.  If
1547     # your software displays them as overprinting the
1548     # surrounding letters, you have a bug.
1549     # (E.g. WeChat v1.2.2.1 on Mac OS 10.7)
1550     ('#',text_sharp),
1551     ('_',text_underline),
1552     ('?',text_question),
1553     ('!',text_exclamation),
1554     (',',text_comma),
1555     (u'\u0251',a_as_in_ah),
1556     (u'\u02d0',ipa_colon),
1557     (u'\u0251\u02d0',var3_a_as_in_ah),
1558     (u'\u0251\u0279',var4_a_as_in_ah),
1559     (u'a\u02d0',var5_a_as_in_ah),
1560     (u'\xe6',a_as_in_apple),
1561     ('a',a_as_in_apple,False),
1562     (u'\u028c',u_as_in_but),
1563     ('\u1d27',u_as_in_but,False), # 28c sometimes mistakenly written as 1d27
1564     (u'\u0252',o_as_in_orange),
1565     (var1_o_as_in_orange,u'\u0251',False),
1566     (u'\u0254',var2_o_as_in_orange),
1567     (u'a\u028a',o_as_in_now),
1568     (u'\xe6\u0254',var1_o_as_in_now),
1569     (u'\u0259',a_as_in_ago),
1570     (u'\u0259\u02d0',e_as_in_herd),
1571     (u'\u025a',var1_a_as_in_ago),
1572     (u'a\u026a',eye), (u'\u028c\u026a',eye,False),
1573     (u'\u0251e',var1_eye),
1574     ('b',b),
1575     (u't\u0283',ch),
1576     (u'\u02a7',ch,False),
1577     ('d',d),
1578     (u'\xf0',th_as_in_them),
1579     (u'\u025b',e_as_in_them),
1580     ('e',var1_e_as_in_them),
1581     (u'\u025d',ar_as_in_year),
1582     (u'\u025c\u02d0',ar_as_in_year,False),
1583     (u'\u025b\u0259',a_as_in_air),
1584     (u'\u025b\u0279',var1_a_as_in_air),
1585     (u'e\u02d0',var2_a_as_in_air),
1586     (u'\u025b\u02d0',var3_a_as_in_air),
1587     (u'e\u0259',var4_a_as_in_air),
1588     (u'e\u026a',a_as_in_ate),
1589     (u'\xe6\u026a',var1_a_as_in_ate),
1590     ('f',f),
1591     (u'\u0261',g), ('g',g,False),
1592     ('h',h),
1593     (u'\u026a',i_as_in_it),
1594     (u'\u0268',var1_i_as_in_it),
1595     (u'\u026a\u0259',ear),
1596     (u'\u026a\u0279',var1_ear),
1597     (u'\u026a\u0279\u0259',var2_ear), # ?
1598     ('i',e_as_in_eat),
1599     (u'i\u02d0',var1_e_as_in_eat),
1600     (u'd\u0292',j_as_in_jump),
1601     (u'\u02a4',j_as_in_jump,False),
1602     ('k',k),
1603     ('x',opt_scottish_loch),
1604     ('l',l),
1605     (u'd\u026b',var1_l),
1606     ('m',m),
1607     ('n',n),
1608     (u'\u014b',ng),
1609     (u'\u0259\u028a',o_as_in_go),
1610     ('o',var1_o_as_in_go),
1611     (u'o\u028a',var2_o_as_in_go),
1612     (u'\u0259\u0289',var1_u_as_in_but),
1613     (u'\u0254\u026a',oy_as_in_toy),
1614     (u'o\u026a',var1_oy_as_in_toy),
1615     ('p',p),
1616     (u'\u0279',r), ('r',r,False),
1617     (var1_r,'r',False),
1618     ('s',s),
1619     (u'\u0283',sh),
1620     ('t',t),
1621     (u'\u027e',var1_t),
1622     (u'\u03b8',th_as_in_think),
1623     (u'\u028a\u0259',oor_as_in_poor),
1624     (u'\u028a\u0279',var1_oor_as_in_poor),
1625     (u'\u028a',opt_u_as_in_pull),
1626     (u'\u0289\u02d0',oo_as_in_food),
1627     (u'u\u02d0',var1_oo_as_in_food),
1628     ('u',var2_oo_as_in_food),
1629     (u'\u0254\u02d0',close_to_or),
1630     (var1_close_to_or,u'\u0254',False),
1631     (u'o\u02d0',var2_close_to_or),
1632     ('v',v),
1633     ('w',w),
1634     (u'\u028d',var1_w),
1635     ('j',y),
1636     ('z',z),
1637     (u'\u0292',ge_of_blige_etc),
1638     (u'\u0294',glottal_stop),
1639     lex_filename="words-ipa.html", # write-only for now
1640     lex_type = "HTML",
1641     lex_header = '<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><body><table>',
1642     lex_entry_format="<tr><td>%s</td><td>%s</td></tr>\n",
1643     lex_footer = "</table></body></html>\n",
1644     word_separator=" ",phoneme_separator="",
1645     stress_comes_before_vowel=True,
1646     safe_to_drop_characters=True, # TODO: really? (at least '-' should be safe to drop)
1647     cvtOut_func=unicode_preprocess,
1648   ),
1649
1650   "unicode-ipa-syls" : makeVariantDic(
1651   "Like unicode-ipa but with syllable separators preserved",
1652   (syllable_separator,'.'),
1653   cleanup_regexps=[(r"\.+",".")], # multiple . to one .
1654   noInherit=True),
1655
1656   "yinghan" : makeVariantDic(
1657      "As unicode-ipa but, when converting a user lexicon, generates Python code that reads Wenlin Yinghan dictionary entries and adds IPA bands to matching words",
1658     lex_filename="yinghan-ipa.py", # write-only for now
1659     lex_type = "Python script",
1660     lex_header = r"""#!/usr/bin/env python
1661 # -*- coding: utf-8 -*-
1662
1663 # Works in both Python 2 and Python 3
1664
1665 import sys; d={""",
1666     lex_entry_format='u"%s":u"%s",\n',
1667     lex_footer = r"""}
1668 import re
1669 try: i,o=sys.stdin.buffer,sys.stdout.buffer # Python 3
1670 except AttributeError: i,o=sys.stdin,sys.stdout # Python 2
1671 for k in list(d.keys()): d[k.lower().encode('utf-8')]=d[k]
1672 nextIsHead=False
1673 for l in i:
1674  o.write(l)
1675  if nextIsHead and l.strip():
1676   w=l.split()
1677   if w[0]==u'ehw'.encode('utf-8'): l=u' '.encode('utf-8').join(w[1:])
1678   k = re.sub(u'\\([^)]*\\)$'.encode('utf-8'),u''.encode('utf-8'),l.strip()).strip().lower() # (allow parenthesised explanation after headword when matching)
1679   if k in d: o.write(u'ipa '.encode('utf-8')+d[k].encode('utf-8')+u'\n'.encode('utf-8'))
1680  if l.startswith(u'*** '.encode('utf-8')): nextIsHead=True
1681 """,
1682     noInherit=True
1683   ),
1684
1685   "unicode-rough" : makeVariantDic(
1686     "A non-standard notation that's reminiscent of unicode-ipa but changed so that more of the characters show in old browsers with incomplete fonts",
1687     ("'",primary_stress),
1688     (',',secondary_stress),
1689     ('ar-',a_as_in_ah),
1690     (':',ipa_colon),
1691     (var3_a_as_in_ah,'ar-',False),
1692     (var4_a_as_in_ah,'ar-',False),
1693     ('uh',u_as_in_but),
1694     (u'\u0259:',e_as_in_herd),
1695     ('ai',eye),
1696     ('ch',ch),
1697     ('e',e_as_in_them),
1698     ('3:',ar_as_in_year),
1699      (a_as_in_air,'e:',False),
1700      (var1_a_as_in_air,'e:',False),
1701      (var2_a_as_in_air,'e:',False),
1702      (var3_a_as_in_air,'e:',False),
1703      (var4_a_as_in_air,'e:',False),
1704     (u'ei',a_as_in_ate),
1705     (u'\xe6i',var1_a_as_in_ate),
1706     ('g',g),
1707     ('i',i_as_in_it), (var1_i_as_in_it,'i',False),
1708     ('eeuh-',ear), (var2_ear,'eeuh-',False),
1709     ('ee',e_as_in_eat), (var1_e_as_in_eat,'ee',False),
1710     ('j',j_as_in_jump),
1711     ('ng',ng),
1712     ('o',o_as_in_go),
1713     (var2_o_as_in_go,'o',False), # override unicode-ipa
1714     (var1_u_as_in_but,'o',False), # ditto (?? '+'?)
1715     ('oy',oy_as_in_toy), (var1_oy_as_in_toy,'oy',False),
1716     ('r',r),
1717     ('sh',sh),
1718     (var1_t,'t',False),
1719     ('th',th_as_in_think),
1720     ('or',oor_as_in_poor),
1721     (var1_oor_as_in_poor,'or',False),
1722     ('u',opt_u_as_in_pull), ('oo',oo_as_in_food),
1723      (var1_oo_as_in_food,'oo',False),
1724      (var2_oo_as_in_food,'oo',False),
1725      (close_to_or,'or',False),
1726      (var1_close_to_or,'or',False),
1727      (var2_close_to_or,'or',False),
1728      (var1_w,'w',False),
1729     ('y',y),
1730     ('3',ge_of_blige_etc),
1731      cleanup_regexps=[('-$','')],
1732     cvtOut_func=None,
1733   ),
1734
1735   "braille-ipa" : makeDic(
1736     "IPA symbols in Braille (2008 BANA standard).  By default Braille ASCII is output; if you prefer to see the Braille dots via Unicode, set the BRAILLE_UNICODE environment variable.", # BANA = Braille Authority of North America.  TODO: check if the UK accepted this standard.
1737     # TODO: add Unicode IPA signs that aren't used in English IPA, so we can do a general IPA conversion
1738     ('_B',primary_stress),
1739     ('_2',secondary_stress),
1740     ('*',a_as_in_ah),
1741     ('3',ipa_colon),
1742     ('*3',var3_a_as_in_ah),
1743     ('*#',var4_a_as_in_ah),
1744     ('A3',var5_a_as_in_ah),
1745     ('%',a_as_in_apple),
1746     ('A',a_as_in_apple,False),
1747     ('+',u_as_in_but),
1748     ('4*',o_as_in_orange),
1749     (var1_o_as_in_orange,'*',False),
1750     ('<',var2_o_as_in_orange),
1751     ('A(',o_as_in_now),
1752     ('%<',var1_o_as_in_now),
1753     ('5',a_as_in_ago),
1754     ('53',e_as_in_herd),
1755     ('5"R.',var1_a_as_in_ago),
1756     ('A/',eye),
1757     ('*E',var1_eye),
1758     ('B',b),
1759     ('T:',ch),
1760     ('T":.',ch,False),
1761     ('D',d),
1762     (']',th_as_in_them),
1763     ('>',e_as_in_them),
1764     ('E',var1_e_as_in_them),
1765     ('4>3',ar_as_in_year), # (from \u025c\u02d0; TODO: check what happens to \u025d)
1766     ('>5',a_as_in_air),
1767     ('>#',var1_a_as_in_air),
1768     ('E3',var2_a_as_in_air),
1769     ('>3',var3_a_as_in_air),
1770     ('E5',var4_a_as_in_air),
1771     ('E/',a_as_in_ate),
1772     ('%/',var1_a_as_in_ate),
1773     ('F',f),
1774     ('G',g),
1775     ('H',h),
1776     ('/',i_as_in_it),
1777     ('0I',var1_i_as_in_it),
1778     ('/5',ear),
1779     ('/#',var1_ear),
1780     ('/#5',var2_ear), # ?
1781     ('I',e_as_in_eat),
1782     ('I3',var1_e_as_in_eat),
1783     ('D!',j_as_in_jump),
1784     ('K',k),
1785     ('X',opt_scottish_loch),
1786     ('L',l),
1787     ('D6L',var1_l),
1788     ('M',m),
1789     ('N',n),
1790     ('$',ng),
1791     ('5(',o_as_in_go),
1792     ('O',var1_o_as_in_go),
1793     ('O(',var2_o_as_in_go),
1794     ('50U',var1_u_as_in_but),
1795     ('</',oy_as_in_toy),
1796     ('O/',var1_oy_as_in_toy),
1797     ('P',p),
1798     ('#',r),
1799     (var1_r,'R',False),
1800     ('S',s),
1801     (':',sh),
1802     ('T',t),
1803     ('6R',var1_t),
1804     ('.?',th_as_in_think),
1805     ('(5',oor_as_in_poor),
1806     ('(#',var1_oor_as_in_poor),
1807     ('(',opt_u_as_in_pull),
1808     ('0U3',oo_as_in_food),
1809     ('U3',var1_oo_as_in_food),
1810     ('U',var2_oo_as_in_food),
1811     ('<3',close_to_or),
1812     (var1_close_to_or,'<',False),
1813     ('O3',var2_close_to_or),
1814     ('V',v),
1815     ('W',w),
1816     ('6W',var1_w),
1817     ('J',y),
1818     ('Z',z),
1819     ('!',ge_of_blige_etc),
1820     ('2',glottal_stop),
1821     lex_filename=ifset("BRAILLE_UNICODE","words-ipa.txt","words-ipa.brl"), # write-only for now
1822     lex_type = "document",
1823     # inline_format=",7%s7'", # -> do this in cleanup_func so it's included in BRAILLE_UNICODE if necessary
1824     lex_entry_format="%s = %s\n", # ditto with the markers
1825     word_separator=" ",phoneme_separator="",
1826     stress_comes_before_vowel=True,
1827     safe_to_drop_characters=True, # TODO: really?
1828     cleanup_func=lambda r:ifset("BRAILLE_UNICODE",ascii_braille_to_unicode,lambda x:x)(",7"+r+"7'"),
1829     cvtOut_func=unicode_to_ascii_braille,
1830   ),
1831
1832   "latex-ipa" : makeDic(
1833     'IPA symbols for typesetting in LaTeX using the "tipa" package',
1834     ('.',syllable_separator,False),
1835     ('"',primary_stress),
1836     ('\\textsecstress{}',secondary_stress),
1837     ('\\#',text_sharp),
1838     ('\\_',text_underline),
1839     ('?',text_question),
1840     ('!',text_exclamation),
1841     (',',text_comma),
1842     ('A',a_as_in_ah),
1843     (':',ipa_colon),
1844     ('A:',var3_a_as_in_ah),
1845     ('A\\textturnr{}',var4_a_as_in_ah),
1846     ('a:',var5_a_as_in_ah),
1847     ('\\ae{}',a_as_in_apple),
1848     ('2',u_as_in_but),
1849     ('6',o_as_in_orange),
1850     (var1_o_as_in_orange,'A',False),
1851     ('O',var2_o_as_in_orange),
1852     ('aU',o_as_in_now),
1853     ('\\ae{}O',var1_o_as_in_now),
1854     ('@',a_as_in_ago),
1855     ('@:',e_as_in_herd),
1856     ('\\textrhookschwa{}',var1_a_as_in_ago),
1857     ('aI',eye),
1858     ('Ae',var1_eye),
1859     ('b',b),
1860     ('tS',ch),
1861     ('d',d),
1862     ('D',th_as_in_them),
1863     ('E',e_as_in_them),
1864     ('e',var1_e_as_in_them),
1865     ('3:',ar_as_in_year),
1866     ('E@',a_as_in_air),
1867     ('E\\textturnr{}',var1_a_as_in_air),
1868     ('e:',var2_a_as_in_air),
1869     ('E:',var3_a_as_in_air),
1870     ('e@',var4_a_as_in_air),
1871     ('eI',a_as_in_ate),
1872     ('\\ae{}I',var1_a_as_in_ate),
1873     ('f',f),
1874     ('g',g),
1875     ('h',h),
1876     ('I',i_as_in_it),
1877     ('1',var1_i_as_in_it),
1878     ('I@',ear),
1879     ('I\\textturnr{}',var1_ear),
1880     ('I@\\textturnr{}',var2_ear), # ?
1881     ('i',e_as_in_eat),
1882     ('i:',var1_e_as_in_eat),
1883     ('dZ',j_as_in_jump),
1884     ('k',k),
1885     ('x',opt_scottish_loch),
1886     ('l',l),
1887     ('d\\textltilde{}',var1_l),
1888     ('m',m),
1889     ('n',n),
1890     ('N',ng),
1891     ('@U',o_as_in_go),
1892     ('o',var1_o_as_in_go),
1893     ('oU',var2_o_as_in_go),
1894     ('@0',var1_u_as_in_but),
1895     ('OI',oy_as_in_toy),
1896     ('oI',var1_oy_as_in_toy),
1897     ('p',p),
1898     ('\\textturnr{}',r),
1899     (var1_r,'r',False),
1900     ('s',s),
1901     ('S',sh),
1902     ('t',t),
1903     ('R',var1_t),
1904     ('T',th_as_in_think),
1905     ('U@',oor_as_in_poor),
1906     ('U\\textturnr{}',var1_oor_as_in_poor),
1907     ('U',opt_u_as_in_pull),
1908     ('0:',oo_as_in_food),
1909     ('u:',var1_oo_as_in_food),
1910     ('u',var2_oo_as_in_food),
1911     ('O:',close_to_or),
1912     (var1_close_to_or,'O',False),
1913     ('o:',var2_close_to_or),
1914     ('v',v),
1915     ('w',w),
1916     ('\\textturnw{}',var1_w),
1917     ('j',y),
1918     ('z',z),
1919     ('Z',ge_of_blige_etc),
1920     ('P',glottal_stop),
1921     lex_filename="words-ipa.tex", # write-only for now
1922     lex_type = "document",
1923     lex_header = r'\documentclass[12pt,a4paper]{article} \usepackage[safe]{tipa} \usepackage{longtable} \begin{document} \begin{longtable}{ll}',
1924     lex_entry_format=r"%s & \textipa{%s}\\"+"\n",
1925     lex_footer = r"\end{longtable}\end{document}"+"\n",
1926     inline_format = "\\textipa{%s}",
1927     inline_oneoff_header = r"% In preamble, put \usepackage[safe]{tipa}"+"\n", # (the [safe] part is recommended if you're mixing with other TeX)
1928     word_separator=" ",phoneme_separator="",
1929     clause_separator=r"\\"+"\n",
1930     stress_comes_before_vowel=True,
1931     safe_to_drop_characters=True, # TODO: really?
1932   ),
1933
1934   "pinyin-approx" : makeDic(
1935     "Rough approximation using roughly the spelling rules of Chinese Pinyin (for getting Chinese-only voices to speak some English words; works with some words better than others)", # write-only for now
1936     ('4',primary_stress),
1937     ('2',secondary_stress),
1938     ('a5',a_as_in_ah),
1939     ('ya5',a_as_in_apple),
1940     ('e5',u_as_in_but),
1941     ('yo5',o_as_in_orange),
1942     ('ao5',o_as_in_now),
1943     (e_as_in_herd,'e5',False),
1944     ('ai5',eye),
1945     ('bu0',b),
1946     ('che0',ch),
1947     ('de0',d),
1948     ('ze0',th_as_in_them),
1949     ('ye5',e_as_in_them),
1950     (a_as_in_air,'ye5',False),
1951     ('ei5',a_as_in_ate),
1952     ('fu0',f),
1953     ('ge0',g),
1954     ('he0',h),
1955     ('yi5',i_as_in_it),
1956     ('yi3re5',ear),
1957     (e_as_in_eat,'yi5',False),
1958     ('zhe0',j_as_in_jump),
1959     ('ke0',k),
1960     ('le0',l),
1961     ('me0',m),
1962     ('ne0',n),
1963     ('eng0',ng),
1964     ('ou5',o_as_in_go),
1965     ('ruo2yi5',oy_as_in_toy),
1966     ('pu0',p),
1967     ('re0',r),
1968     ('se0',s),
1969     ('she0',sh),
1970     ('te0',t),
1971     (th_as_in_think,'zhe0',False),
1972     (oor_as_in_poor,'wu5',False),
1973     ('yu5',oo_as_in_food),
1974     ('huo5',close_to_or),
1975     (v,'fu0',False),
1976     ('wu0',w),
1977     ('yu0',y),
1978     (z,'ze0',False),
1979     (ge_of_blige_etc,'zhe0',False),
1980     approximate_missing=True,
1981     lex_filename="words-pinyin-approx.txt", # write-only for now
1982     lex_type = "text",
1983     lex_header = "Pinyin approxmations (very approximate!)\n----------------------------------------\n",
1984     lex_entry_format = "%s ~= %s\n",
1985     word_separator=" ",phoneme_separator="",
1986     cleanup_regexps=[
1987       ("te0ye","tie"),
1988       ("e0e5","e5"),("([^aeiou][uo])0e(5)",r"\1\2"),
1989       ("yu0y","y"),
1990       ("wu0yo5","wo5"),
1991       ("([bdfghklmnpwz])[euo]0ei",r"\1ei"),
1992       ("([bdghklmnpstwz])[euo]0ai",r"\1ai"),
1993       ("([ghklmnpstyz])[euo]0ya",r"\1a"),("([ghklmnpstz])a([0-5]*)ne0",r"\1an\2"),
1994       ("([bdfghklmnpstwyz])[euo]0a([1-5])",r"\1a\2"),
1995       ("([bdjlmnpt])[euo]0yi",r"\1i"),("([bjlmnp])i([1-5]*)ne0",r"\1in\2"),
1996       ("([zs])he0ei",r"\1hei"),
1997       ("([dfghklmnprstyz])[euo]0ou",r"\1ou"),
1998       ("([dghklnrst])[euo]0huo",r"\1uo"),
1999       ("([bfpm])[euo]0huo",r"\1o"),
2000       ("([bdghklmnprstyz])[euo]0ao",r"\1ao"),
2001       ("([zcs])h[eu]0ao",r"\1hao"),
2002       ("re0r","r"),
2003       ("zhe0ne0","zhun5"),
2004       ("54","4"),
2005       ("52","2"),
2006       ("([bdjlmnpty])i([1-9])eng0",r"\1ing\2"),
2007       ("ya([1-9])eng0",r"yang\1"),
2008       ("ya([1-9])ne0",r"an\1"),
2009       ("ye([1-9])ne0",r"yan\1"),("([wr])[eu]0yan",r"\1en"),
2010       ("yi([1-9])ne0",r"yin\1"),
2011
2012       ("yu0","yu5"),("eng0","eng5"), # they won't work unvoiced anyway
2013       ("0","5"), # comment out if the synth supports 'tone 0 for unvoiced'
2014       #("[euo]0","0"), # comment in if it expects consonants only when doing that
2015     ],
2016   ),
2017
2018   "kana-approx" : makeDic(
2019   "Rough approximation using kana (for getting Japanese computer voices to speak some English words; works with some words better than others).  Set KANA_TYPE environment variable to hiragana or katakana (which can affect the sounds of some voices); default is hiragana", # for example on Mac OS 10.7+ (with Japanese voice installed in System Preferences) try PHONES_PIPE_COMMAND='say -v Kyoko' (this voice has a built-in converter from English as well, but lexconvert --phones kana-approx can work better with some complex words, although the built-in converter does seem to have access to slightly more phonemes and can therefore produce words like "to" better).  Default is hiragana because I find hiragana easier to read than katakana, although the Kyoko voice does seem to be able to say 'v' a little better when using kata.  Mac OS 10.7+'s Korean voices (Yuna and Narae) can also read kana, and you could try doing a makeVariantDic and adding in some Korean jamo letters for them (you'd be pushed to represent everything in jamo but kana+jamo seems more hopeful in theory), but again some words work better than others (not all phonetic combinations are supported and some words aren't clear at all).
2020     # This kana-approx format is 'write-only' for now (see comment in cleanup_regexps re possible reversal)
2021     (u'\u30fc',primary_stress),
2022     (secondary_stress,ifset('KANA_MORE_EMPH',u'\u30fc'),False), # set KANA_MORE_EMPH environment variable if you want to try doubling the secondary-stressed vowels as well (doesn't always work very well; if it did, I'd put this line in a makeVariantDic called kana-approx-moreEmph or something)
2023     # The following Unicode codepoints are hiragana; KANA_TYPE is handled by cleanup_func below
2024     (u'\u3042',a_as_in_apple),
2025     (u'\u3044',e_as_in_eat),
2026     (u'\u3046',oo_as_in_food),
2027     (u'\u3048',e_as_in_them),
2028     (u'\u304a',o_as_in_orange),
2029     (u'\u3042\u3044',eye), # ai
2030     (u'\u3042\u304a',o_as_in_now), # ao
2031     (u'\u3048\u3044',a_as_in_ate), # ei
2032     (u'\u304a\u3044',oy_as_in_toy), # oi
2033     (u'\u304a\u3046',o_as_in_go), # ou
2034     (a_as_in_ah,u'\u3042',False),
2035     (a_as_in_ago,u'\u3046\u304a',False), # TODO: \u3042, \u304a or \u3046 depending on the word?
2036     (e_as_in_herd,u'\u3042',False), # TODO: really?
2037     (i_as_in_it,u'\u3044',False), # TODO: really?
2038     (u_as_in_but,u'\u3046',False), # TODO: really?
2039     (ar_as_in_year,u'\u3048',False), # TODO: really?
2040     (ear,u'\u3044\u304a',False), # TODO: really?
2041     (a_as_in_air,u'\u3048',False), # TODO: really?
2042     (oor_as_in_poor,u'\u304a',False), # TODO: really?
2043     (close_to_or,u'\u304a\u30fc'), # TODO: really?
2044     (u'\u3076',b), # bu (with vowel replacements later)
2045     (u'\u3061\u3047',ch), # chu (ditto)
2046     (u'\u3065',d), # du (and so on)
2047     (u'\u3066\u3085',th_as_in_think), (th_as_in_them,u'\u3066\u3085',False),
2048     (u'\u3075',f),
2049     (u'\u3050',g),
2050     (u'\u306f',h), # ha (as hu == fu)
2051     (u'\u3058\u3085',j_as_in_jump), (ge_of_blige_etc,u'\u3058\u3085',False),
2052     (u'\u304f',k),
2053     (u'\u308b',l), (r,u'\u308b',False),
2054     (u'\u3080',m),
2055     (u'\u306c',n),
2056     (u'\u3093\u3050',ng),
2057     (u'\u3077',p),
2058     (u'\u3059',s),
2059     (u'\u3057\u3085',sh),
2060     (u'\u3064',t),
2061     (u'\u308f',w), # use 'wa' (as 'wu' == 'u')
2062     (v,ifset('KANA_V_AS_W',u'\u308f',u'\u3094'),False), # TODO: document KANA_V_AS_W variable.  Is vu always supported? (it doesn't seem to show up in all fonts)
2063     (u'\u3086',y),
2064     (u'\u305a',z),
2065     lex_filename="words-kana-approx.txt",
2066     lex_type = "text",
2067     lex_header = "Kana approxmations (very approximate!)\n--------------------------------------\n",
2068     lex_entry_format = "%s ~= %s\n",
2069     word_separator=" ",phoneme_separator="",
2070     clause_separator=u"\u3002\n".encode('utf-8'),
2071     cleanup_regexps=[(u"\u306c$",u"\u3093\u30fc"), # TODO: or u"\u3093\u3093" ?
2072        # now the vowel replacements (bu+a -> ba, etc) (in most cases these can be reversed into cvtOut_regexps if you want to use the kana-approx table to convert hiragana into approximate English phonemes (plus add a (u"\u3093\u30fc*",u"\u306c") and perhaps de-doubling rules to convert back to emphasis) but the result is unlikely to be any good)
2073        (u"\u3076\u3042",u"\u3070"),(u"\u3076\u3044",u"\u3073"),(u"\u3076\u3048",u"\u3079"),(u"\u3076\u304a",u"\u307c"),(u"\u3076\u3046",u"\u3076"),
2074        (u"\u3061\u3085\u3042",u"\u3061\u3083"),(u"\u3061\u3085\u3046",u"\u3061\u3085"),(u"\u3061\u3085\u3048",u"\u3061\u3047"),(u"\u3061\u3085\u304a",u"\u3061\u3087"),(u"\u3061\u3085\u3044",u"\u3061"),
2075        (u"\u3065\u3042",u"\u3060"),(u"\u3065\u3044",u"\u3062"),(u"\u3065\u3048",u"\u3067"),(u"\u3065\u304a",u"\u3069"),(u"\u3065\u3046",u"\u3065"),
2076        (u"\u3066\u3085\u3042",u"\u3066\u3083"),(u"\u3066\u3085\u3044",u"\u3066\u3043"),(u"\u3066\u3043\u3046",u"\u3066\u3085"),(u"\u3066\u3085\u3048",u"\u3066\u3047"),(u"\u3066\u3085\u304a",u"\u3066\u3087"),
2077        (u"\u3075\u3042",u"\u3075\u3041"),(u"\u3075\u3044",u"\u3075\u3043"),(u"\u3075\u3048",u"\u3075\u3047"),(u"\u3075\u304a",u"\u3075\u3049"),(u"\u3075\u3046",u"\u3075"),
2078        (u"\u306f\u3044",u"\u3072"),(u"\u306f\u3046",u"\u3075"),(u"\u306f\u3048",u"\u3078"),(u"\u306f\u304a",u"\u307b"),(u"\u306f\u3042",u"\u306f"),
2079        (u"\u3050\u3042",u"\u304c"),(u"\u3050\u3044",u"\u304e"),(u"\u3050\u3048",u"\u3052"),(u"\u3050\u304a",u"\u3054"),(u"\u3050\u3046",u"\u3050"),
2080        (u"\u3058\u3085\u3042",u"\u3058\u3083"),(u"\u3058\u3085\u3046",u"\u3058\u3085"),(u"\u3058\u3085\u3048",u"\u3058\u3047"),(u"\u3058\u3085\u304a",u"\u3058\u3087"),(u"\u3058\u3085\u304a",u"\u3058"),
2081        (u"\u304f\u3042",u"\u304b"),(u"\u304f\u3044",u"\u304d"),(u"\u304f\u3048",u"\u3051"),(u"\u304f\u304a",u"\u3053"),(u"\u304f\u3046",u"\u304f"),
2082        (u"\u308b\u3042",u"\u3089"),(u"\u308b\u3044",u"\u308a"),(u"\u308b\u3048",u"\u308c"),(u"\u308b\u304a",u"\u308d"),(u"\u308b\u3046",u"\u308b"),
2083        (u"\u3080\u3042",u"\u307e"),(u"\u3080\u3044",u"\u307f"),(u"\u3080\u3048",u"\u3081"),(u"\u3080\u304a",u"\u3082"),(u"\u3080\u3046",u"\u3080"),
2084        (u"\u306c\u3042",u"\u306a"),(u"\u306c\u3044",u"\u306b"),(u"\u306c\u3048",u"\u306d"),(u"\u306c\u304a",u"\u306e"),(u"\u306c\u3046",u"\u306c"),
2085        (u"\u3077\u3042",u"\u3071"),(u"\u3077\u3044",u"\u3074"),(u"\u3077\u3048",u"\u307a"),(u"\u3077\u304a",u"\u307d"),(u"\u3077\u3046",u"\u3077"),
2086        (u"\u3059\u3042",u"\u3055"),(u"\u3059\u3048",u"\u305b"),(u"\u3059\u304a",u"\u305d"),(u"\u3059\u3046",u"\u3059"),
2087        (u"\u3057\u3085\u3042",u"\u3057\u3083"),(u"\u3057\u3085\u3046",u"\u3057\u3085"),(u"\u3057\u3085\u3048",u"\u3057\u3047"),(u"\u3057\u3085\u304a",u"\u3057\u3087"),(u"\u3057\u3085\u3044",u"\u3057"),
2088        (u"\u3064\u3042",u"\u305f"),(u"\u3064\u3044",u"\u3061"),(u"\u3064\u3048",u"\u3066"),(u"\u3064\u304a",u"\u3068"),(u"\u3064\u3046",u"\u3064"),
2089        (u"\u3086\u3042",u"\u3084"),(u"\u3086\u3048",u"\u3044\u3047"),(u"\u3086\u304a",u"\u3088"),(u"\u3086\u3046",u"\u3086"),
2090        (u"\u305a\u3042",u"\u3056"),(u"\u305a\u3044",u"\u3058"),(u"\u305a\u3048",u"\u305c"),(u"\u305a\u304a",u"\u305e"),(u"\u305a\u3046",u"\u305a"),
2091        (u"\u308f\u3044",u"\u3046\u3043"),(u"\u308f\u3046",u"\u3046"),(u"\u308f\u3048",u"\u3046\u3047"),(u"\u308f\u304a",u"\u3092"),(u"\u308f\u3042",u"\u308f"),
2092        (u'\u3046\u3043\u3066\u3085', u'\u3046\u3043\u3065'), # sounds a bit better for words like 'with'
2093        (u'\u3085\u3046',u'\u3085'), # and 'the' (especially with a_as_in_ago mapping to u'\u3046\u304a'; it's hard to get a convincing 'the' though, especially in isolation)
2094        (u'\u3050\u3050',u'\u3050'), # gugu -> gu, sometimes comes up with 'gl-' combinations
2095        (u'\u30fc\u30fc+',u'\u30fc'), # in case we put 30fc in the table AND a stress mark has been applied to it
2096        (u'^(.)$',u'\\1\u30fc'), # lengthen any word that ends up as a single kana (otherwise can be clipped badly)
2097     (u'^([\u3042\u3070\u3060\u304c\u304b\u3089\u307e\u306a\u3071\u3055\u305f\u3084\u3056\u308f]\u3044)$',u'\\1\u30fc'), # ditto for -ai (TODO: -ao might need lengthening sometimes?? depends on context.  -ei, -oi, -ou seem OK)
2098     ],
2099     cleanup_func = hiragana_to_katakana
2100   ),
2101
2102   "deva-approx" : makeDic(
2103   "Rough approximation using Devanagari (for getting Indian computer voices to speak some English words; works with some words better than others); can also be used to approximate Devanagari words in English phonemes",
2104     (u'\u02c8',primary_stress),
2105     (u'\u093e',a_as_in_ah),(u'\u0906',a_as_in_ah,False),
2106     (u'\u0905',u_as_in_but),
2107     (u'\u092c',b),
2108     (u'\u091b',ch),(u'\u091a',ch,False),
2109     (u'\u0926',d),(u'\u0921',d,False), # TODO: check which sounds better for reading English words
2110     (u'\u0920',th_as_in_them), # (very approximate)
2111     (u'\u0948',e_as_in_them),(u'\u0910',e_as_in_them,False),
2112     (u'\u0947',a_as_in_ate),(u'\u090f',a_as_in_ate,False),
2113     (u'\u092b\u093c',f),
2114     (u'\u0917',g),
2115     (u'\u0917\u093c',g,False), # (Hindi; differs in others)
2116     (u'\u0939',h),(u'\u0903',h,False),
2117     (u'\u093f',i_as_in_it),(u'\u0907',i_as_in_it,False),
2118     (u'\u0940',e_as_in_eat),(u'\u0908',e_as_in_eat,False),
2119     (u'\u091c',j_as_in_jump),
2120     (u'\u0915',k),(u'\u0916',k,False),
2121     (u'\u0916\u093c',opt_scottish_loch),
2122     (u'\u0915\u093c',opt_scottish_loch,False), # ?
2123     (u'\u0932',l),
2124     (u'\u092e',m),
2125     (u'\u0928',n),(u'\u0923',n,False),
2126     (u'\u0902',ng),
2127     (u'\u092a',p),
2128     (u'\u092b',f,False), # (Hindi; p in some others?)
2129     (u'\u0930',r),(u'\u0921\u093c',r,False),
2130     (u'\u0938',s),
2131     (u'\u0936',sh), (u'\u0937',sh,False),
2132     (u'\u091f',t),(u'\u0924',t,False),(u'\u0925',t,False),
2133     (u'\u0941',opt_u_as_in_pull),(u'\u0909',opt_u_as_in_pull,False),
2134     (u'\u0942',oo_as_in_food),(u'\u090a',oo_as_in_food,False),
2135     (u'\u094c',close_to_or),(u'\u0914',close_to_or,False),
2136     (u'\u094b',opt_ol_as_in_gold),(u'\u0913',opt_ol_as_in_gold,False),
2137     (u'\u0935',v),(w,u'\u0935',False),
2138     (u'\u092f',y),
2139     (u'\u091c\u093c',z),
2140     (u'\u091d\u093c',ge_of_blige_etc),
2141     (u'\u0901',ipa_colon),
2142     word_separator=" ",phoneme_separator="",
2143     stress_comes_before_vowel=True,
2144     safe_to_drop_characters=True, # it's an approximation
2145     approximate_missing=True,
2146     cleanup_regexps=[
2147        # add virama if consonant not followed by vowel, and delete default vowel after consonant:
2148        (u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u094d'),(u'(?<=[\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0905',u''),(u'(.)\u094d\u02c8',u'\u02c8\\1'),
2149        # replace vowel signs with vowel letters if not preceded by consonants:
2150        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093e',u'\u0906'),
2151        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u093f',u'\u0907'),
2152        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0940',u'\u0908'),
2153        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0941',u'\u0909'),
2154        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0942',u'\u090a'),
2155        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0947',u'\u090f'),
2156        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u0948',u'\u0910'),
2157        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094b',u'\u0913'),
2158        (u'(?<![\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939\u093c])\u094c',u'\u0914')],
2159     cvtOut_func=unicode_preprocess,
2160     cvtOut_regexps=[
2161        # add default vowel when necessary:
2162        (u'([\u0902\u0903\u0915-\u0917\u091a-\u091d\u091f-\u0928\u092a-\u0930\u0932\u0935-\u0939]\u093c?)(?![\u0905\u094d\u093e-\u0942\u0947\u0948\u094b\u094c])',u'\\1\u0905'),(u'\u094d',u''),
2163        # 'add h' approximations:
2164        (u'\u092d',u'\u092c\u0939'),(u'\u0927',u'\u0922\u0939'),(u'\u0918',u'\u0917\u0939'),(u'\u091d',u'\u091c\u0939'),(u'\u0922\u093c',u'\u0921\u093c\u0939'),
2165     ]),
2166
2167   "names" : makeDic(
2168     "Lexconvert internal phoneme names (sometimes useful with the --phones option while developing new formats)",
2169      *[(phName,phVal) for phName,phVal in phonemes.items()])}
2170
2171 # The mainopt_...() functions are the main options
2172 # (if you implement a new one, main() will detect it);
2173 # 1st line of doc string should be parameter summary
2174 # (start the doc string with \n if no parameters); if 1st
2175 # character of doc string is * then this function is put
2176 # among the first in the help (otherwise alphabetically).
2177 # If function returns a string, that's taken to be a
2178 # message to be printed with error exit.  Same if it raises
2179 # an exception of type Message.
2180
def mainopt_try(i):
   """*<format> [<pronunciation>]
Convert input from <format> into eSpeak and try it out.
(Requires the 'espeak' command.)
E.g.: python lexconvert.py --try festival h @0 l ou1
 or: python lexconvert.py --try unicode-ipa '\\u02c8\\u0279\\u026adn\\u0329' (for Unicode put '\\uNNNN' or UTF-8)"""
   # i is the index in sys.argv of the option itself;
   # the source format follows it, then any phonemes
   srcFormat = sys.argv[i+1]
   if srcFormat not in lexFormats:
      return "No such format "+repr(srcFormat)+" (use --formats to see a list of formats)"
   prompt = "phonemes in "+srcFormat+" format"
   for phones in getInputText(i+2,prompt,'maybe'):
      asEspeak = convert(phones,srcFormat,'espeak')
      # separate process each item for more responsiveness from the console (sending 'maybe' to getInputText means won't lose efficiency if not read from console)
      pipe = os.popen("espeak -x","w")
      getBuf(pipe).write(markup_inline_word("espeak",asEspeak)+as_utf8('\n'))
2193
def mainopt_trymac(i):
   """*<format> [<pronunciation>]
Convert phonemes from <format> into Mac and try it using the Mac OS 'say' command"""
   srcFormat = sys.argv[i+1]
   if srcFormat not in lexFormats:
      return "No such format "+repr(srcFormat)+" (use --formats to see a list of formats)"
   prompt = "phonemes in "+srcFormat+" format"
   for resp in getInputText(i+2,prompt,'maybe'):
      toSay = markup_inline_word("mac",convert(resp,srcFormat,'mac'))
      print(as_printable(toSay)) # show what we are about to send
      # Need to specify a voice because the default voice might not be able to take Apple phonemes.  Vicki has been available since 10.3, as has the 'say' command (previous versions need osascript, see Gradint's code)
      sayPipe = os.popen(macSayCommand()+" -v Vicki","w")
      getBuf(sayPipe).write(toSay)
2205
def mainopt_trymac_uk(i):
   """*<format> [<pronunciation>]
Convert phonemes from <format> and try it with Mac OS British voices (see --mac-uk for details)"""
   # Return the message instead of using assert: an assert is
   # removed when running under python -O, which would silently
   # drop this safety check.  (A string returned from a
   # mainopt_ function is printed with error exit.)
   if not sys.version_info[0]==2: return "--trymac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
   format = sys.argv[i+1]
   if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
   for resp in getInputText(i+2,"phonemes in "+format+" format",'maybe'):
      macuk = convert(resp,format,'mac-uk')
      m = MacBritish_System_Lexicon("",os.environ.get("MACUK_VOICE","Daniel"))
      try:
         # always close the lexicon object, even if interrupted
         try: m.speakPhones(macuk.split())
         finally: m.close()
      except KeyboardInterrupt:
         sys.stderr.write("Interrupted\n")
2220
def mainopt_phones(i):
   """*<format> [<words>]
Use eSpeak to convert text to phonemes, and then convert the phonemes to format 'format'.
E.g.: python lexconvert.py --phones unicode-ipa This is a test sentence.
Set environment variable PHONES_PIPE_COMMAND to an additional command to which to write the phones as well as standard output.  (If standard input is a terminal then this will be done separately after each line.)
(Some commercial speech synthesizers do not work well when driven entirely from phonemes, because their internal format is different and is optimised for normal text.)
Set format to 'all' if you want to see the phonemes in ALL supported formats."""
   format = sys.argv[i+1]
   if format=="example": return "The 'example' format cannot be used with --phones; try --convert, or did you mean --phones festival" # could allow example anyway as it's basically Festival, but save confusion as eSpeak might not generate the same phonemes if our example words haven't been installed in the system's eSpeak.  (Still allow it to be used in --try etc though.)
   if not format in lexFormats and not format=="all": return "No such format "+repr(format)+" (use --formats to see a list of formats)"
   # Work out the output formats once, and never rebind
   # 'format' afterwards: if the per-format loop reused that
   # name, then with 'all' every input line after the first
   # would be output in only the last format.
   if format=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
   else: formats = [format]
   def out(fmt,response,doOneoff=True):
      # writes one response in one format to the current sys.stdout
      if len(formats)>1: writeFormatHeader(fmt)
      if doOneoff: getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_oneoff_header")))
      getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_header")))
      output_clauses(fmt,convert(parseIntoWordsAndClauses("espeak",response),"espeak",fmt))
      getBuf(sys.stdout).write(as_utf8(checkSetting(fmt,"inline_footer")))
      print("")
      sys.stdout.flush() # in case it's being piped
   hadOneoff = False
   for response in getInputText(i+2,"text",'maybe'):
      response = pipeThroughEspeak(as_utf8(response).replace(u'\u2032'.encode('utf-8'),as_utf8('')).replace(u'\u00b4'.encode('utf-8'),as_utf8('')).replace(u'\u02b9'.encode('utf-8'),as_utf8('')).replace(u'\u00b7'.encode('utf-8'),as_utf8(''))) # (remove any 2032 and b7 pronunciation marks before passing to eSpeak)
      if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
      for fmt in formats:
         out(fmt,response,not hadOneoff) ; hadOneoff = True
         pipeCommand = os.environ.get("PHONES_PIPE_COMMAND","")
         if pipeCommand:
            # temporarily point sys.stdout at the extra command
            o,sys.stdout = sys.stdout,os.popen(pipeCommand,'w')
            try: out(fmt,response)
            finally: sys.stdout = o # restore even if output failed
2251
def mainopt_ruby(i):
   """*<format> [<words>]
Like --phones but outputs the result as HTML RUBY markup, with each word's pronunciation symbols placed above the corresponding English word.
E.g.: python lexconvert.py --ruby unicode-ipa This is a test sentence.
This option is made more complicated by the fact that different versions of eSpeak may space the phoneme output differently, for example when handling numbers; if your eSpeak version is not recognised then all numbers are unannotated. Anyway you are advised not to rely on this option working with the new development NG versions of eSpeak. If the version you have behaves unexpectedly, words and phonemes output might lose synchronisation. However this option is believed to be stable when used with simple text and the original eSpeak.
You can optionally set the RUBY_GRADINT_CGI environment variable to the URL of an instance of Gradint Web Edition to generate audio links for each word.  If doing this in a Web Adjuster filter, see comments in the lexconvert source for setup details."""
   # htmlFilter with --htmlText of course.  Set separator to two newlines and copy the generated 'h5a' function (from a manual run or the lexconvert source) into Adjuster's headAppend option (but don't expect HTML5 audio to work from Adjuster's submitBookmarklet option; pronunciation links will take you off the page if it doesn't).
   # Use double newlines as single newlines are used in the h5a script; adding that script via bookmarklet doesn't always run it
   format = sys.argv[i+1]
   if format=="example": return "The 'example' format cannot be used with --ruby; did you mean festival?" # as above
   elif format=="all": return "The --phones all option cannot be used with --ruby" # (well you could implement it if you want but the resulting ruby would be quite unwieldy)
   if not format in lexFormats: return "No such format "+repr(format)+" (use --formats to see a list of formats)"
   # Normalise various apostrophe-like characters to ', drop the middle dot, and turn no-break space into a normal space.  'text' is a byte string from here on.
   text = as_utf8(getInputText(i+2,"text")).replace(u'\u2019'.encode('utf-8'),as_utf8("'")).replace(u'\u2032'.encode('utf-8'),as_utf8("'")).replace(u'\u00b4'.encode('utf-8'),as_utf8("'")).replace(u'\u02b9'.encode('utf-8'),as_utf8("'")).replace(u'\u00b7'.encode('utf-8'),as_utf8('')).replace(u'\u00a0'.encode('utf-8'),as_utf8(' '))
   # eSpeak's basic idea of an alphabetical word (most versions?) -
   wordRegexps = [r"(?:[A-Z]+['?-])*(?:(?:(?<![A-z.])(?:[A-z]\.)+[A-z](?![A-z.]))|[A-Z]+[a-z](?![A-z])|[A-Z][A-Z]+(?![a-z][A-z])|[A-Z]?(?:[a-z]['?-]?)+|[A-Z])"]
   # A dot, when not part of an ellipsis, followed by a letter is pronounced "dot", and two of them are pronounced "dot dot":
   wordRegexps.append(r"(?<!\.\.)\.(?=[A-z])|(?<!\.)\.(?=\.[A-z])")
   # ! followed by a letter is pronounced "exclamation", and .! is "dotexclamation"; @ symbols similarly; copyright
   atEtc = u"(?:[@!:]|\u00a9)*".encode('utf-8')
   wordRegexps.append(as_utf8(r"\.?[!@]+(?=[A-z])|(?<![A-z])@")+atEtc+as_utf8("(?![A-z])|")+unichr(0xa9).encode('utf-8')+atEtc)
   # : between numbers if NOT followed by 2 digits:
   wordRegexps.append(r"(?<![A-z]):(?![A-z]|[0-9][0-9])")
   # - between numbers
   wordRegexps.append(r"(?<=[0-9])-(?=[0-9])")
   # TODO: if you paste in (e.g.) CJK characters, eSpeak will say "symbol-symbol-symbol" etc, but this is not accounted for by the above regexp so it'll go onto following words.
   vLine = espeak_version_line()
   if "1.45." in vLine:
      # This seems to work in eSpeak 1.45:
      # (TODO: test leading 0s & leading decimal)
      # a number of 4 digits or less (with any number of digits after the decimal point) is grouped as 1 word:
      wordRegexps.append(r"(?<![0-9])[0-9]{1,4}(?:\.[0-9]+)?(?!,?[0-9])")
      # and a number of 1 to 3 digits with any number of 000 or ,000 groups, with optional decimal point followed by any number of digits, OR when placed before an integer number of 3-digit groups, is grouped as 1 word:
      wordRegexps.append(r"[0-9]{1,3}(?:,?000)*(?:\.[0-9]+)?,?(?=(?:,?[0-9]{3,3})*,?(?:[^0-9]|$))")
      text2 = text
   elif "1.48." in vLine:
      # In eSpeak 1.48 the groups are smaller.
      # Decimal point and everything after it = individual
      wordRegexps.append(r"(?<=[0-9])\.(?=[0-9])")
      for places in range(25): # TODO: really want unbounded, but (?<=...) is fixed-length
         wordRegexps.append(r"(?<=[0-9]\."+"[0-9]"*places+r")[0-9]")
      # Number with a leading dot grouped as 1 word:
      wordRegexps.append(r"(?<![0-9])\.[0-9]+")
      # TODO: leading 0s (0000048 goes to 0 000 048)
      # For normal numbers:
      # A null string w. 3 or 6 digits to go and digits b4 shld match for 'thousand', 'million' (unless 3+ digits are leading 0s, or fewer than 3 leading 0s and whole thing begins with a 0, or it's part of a decimal expansion, in which case different rules apply, but (?<=...) must be fixed-length, so we need another one of these awful loops) :
      for prevDigits in range(10):
         for beforeThat in ["^",r"[^.0-9,]"]: # beginning of string, or something OTHER than a decimal point / num
            wordRegexps.append(r"(?<="+beforeThat+"[1-9]"+"[0-9,]"*prevDigits+r")(?<!,)(?<!000)(?# empty string )(?=(?:,?(?:[0-9]{3,3}))+(?:[^0-9]|$))")
      # 1-9 (not 0) with 2, 5 or 8 etc digits to go = "N-hundred-and" :
      wordRegexps.append(r"[1-9](?=[0-9][0-9](?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
      # + 0 with 2 digits to go when preceded by digits = "and", as long as followed by at least one non-0:
      wordRegexps.append(r"(?<=[0-9,])0(?=(?:[0-9][1-9]|[1-9][0-9])(?:[^0-9,]|$))")
      # 1 or 2 digits with 0,3,6.. to go = "seventy-six" or whatever, as long as they're not both 0 :
      wordRegexps.append(r"(?:0[1-9]|[1-9][0-9]?)(?=(?:,?(?:[0-9]{3,3}))*(?:[^0-9]|$))")
      # 0 by itself (not preceded by digits) = "nought" :
      wordRegexps.append(r"(?<![0-9])0(?=[^0-9]|$)")
      wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:of|on|in|that|with|for|was) (?:the|a)(?= )")
      wordRegexps.insert(0,"(?:Of|On|In|That|With|For|Was) (?:the|a)(?= )")
      wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])not a(?= )")
      wordRegexps.insert(0,"(?<=[^A-Za-z0-9_-])(?:some|that) one(?= )")
      wordRegexps.insert(0,"(?:Some|That) one(?= )")
      text2 = text
   else: text2 = re.sub(maybe_bytes(r"\.?[0-9]+",text),maybe_bytes("",text),text) # unknown eSpeak version: don't annotate the numbers (pattern and replacement must be the same string type as text, or Python 3's re raises TypeError)
   response = pipeThroughEspeak(text2)
   if not as_utf8('\n') in response.rstrip() and as_utf8('command') in response: return response.strip() # 'bad cmd' / 'cmd not found'
   gradint_cgi = os.environ.get("RUBY_GRADINT_CGI","")
   if gradint_cgi:
      linkStart,linkEnd = lambda w:maybe_bytes('<a href="',w)+maybe_bytes(gradint_cgi,w)+maybe_bytes('?js=[[',w)+w.replace(maybe_bytes('%',w),maybe_bytes('%25',w)).replace(maybe_bytes('&',w),maybe_bytes('%26',w))+maybe_bytes(']]&jsl=en" onclick="return h5a(this);">',w), '</a>'
      print(r"""<script><!-- // HTML5-audio function
function h5a(link) {
 if (document.createElement) {
   var ae = document.createElement('audio');
   if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/mpeg'))) {
     ae.setAttribute('src', link.href);
     ae.play(); return false;
   } else if (ae.canPlayType && function(s){return s!="" && s!="no"}(ae.canPlayType('audio/ogg'))) {
     ae.setAttribute('src', link.href+"&filetype=ogg");
     ae.play(); return false; }
 } return true; }
//--></script>""")
   else: linkStart,linkEnd = lambda w:maybe_bytes("",w), ""
   # Collect the <rt> content for each word eSpeak gave us:
   rubyList = []
   for clause in parseIntoWordsAndClauses("espeak",response):
      for w in clause:
         converted = convert(w,"espeak",format)
         if not converted: continue # e.g. a lone _:_:
         m = markup_inline_word(format,converted)
         rubyList.append(linkStart(w)+m.replace(maybe_bytes("&",m),maybe_bytes("&amp;",m)).replace(maybe_bytes("<",m),maybe_bytes("&lt;",m))+maybe_bytes(linkEnd,w))
   rubyList.reverse() # so can pop() left-to-right order
   # Write out re.sub ourselves, because (1) some versions of the library (e.g. on 2.7.12) try to do some things in-place, and we're using previous-context regexps that aren't compatible with previous things having been already <ruby>'ified, and (2) if we match a 0-length string, re.finditer won't ALSO return a non-0 length match starting in the same place, and we want both (so we're using wordRegexps as a list rather than an | expression)
   matches = {} # (start,end) -> index of the (last) regexp that matched there
   debug = False # if True, will add ruby title=(index of the regexp that matched)
   for regexpIndex,r in enumerate(wordRegexps):
      for match in re.finditer(maybe_bytes(r,text),text):
         matches[(match.start(),match.end())] = regexpIndex
   i = 0 ; r = []
   # Earliest start first, and for equal starts the longest match first.  (Use key= rather than a cmp function: Python 3's sorted() does not accept a cmp function.)
   for start,end in sorted(matches.keys(),key=lambda span:(span[0],-span[1])):
      if start<i: continue # overlap??
      r.append(text[i:start])
      if start==end: m = as_utf8("&nbsp;")
      else: m = text[start:end].replace(maybe_bytes("&",text),maybe_bytes("&amp;",text)).replace(maybe_bytes("<",text),maybe_bytes("&lt;",text))
      try: rt = rubyList.pop()
      except IndexError: rt = as_utf8("ERROR") # we've lost synchronisation
      if debug: title = as_utf8(" title=")+as_utf8(str(matches[(start,end)]))
      else: title = as_utf8("")
      r.append(as_utf8("<ruby")+title+as_utf8("><rb>")+m+as_utf8("</rb><rt>")+rt+as_utf8("</rt></ruby>"))
      i = end
   r.append(text[i:])
   while rubyList: # oops, lost synchronisation the other way (TODO: show this per-paragraph? but don't call eSpeak too many times if processing many short paragraphs)
      r.append(as_utf8("<ruby><rb>ERROR</rb><rt>")+rubyList.pop()+as_utf8("</rt></ruby>"))
   out = as_utf8("").join(r)
   if not out.endswith(as_utf8("\n")): out += as_utf8("\n")
   getBuf(sys.stdout).write(out)
2374
def pipeThroughEspeak(inpt):
   """Writes inpt to espeak -q -x (in chunks if necessary) and returns the result.
   inpt must be a byte string; the result is a byte string.
   Input longer than the buffer size is split (preferably
   at a newline, otherwise at a space) and each part is
   sent to a separate espeak process.  If a part's response
   is a single line containing 'command' (bad command /
   command not found), that response is returned at once."""
   assert type(inpt)==bytes
   bufsize = 8192 # careful not to set this too big, as the OS might limit it (TODO can we check?)
   # inpt is bytes, so everything we search it for must be
   # bytes as well (a str argument is a TypeError in Python 3)
   newline,space,cmdWord = as_utf8('\n'),as_utf8(' '),as_utf8('command')
   ret = []
   while len(inpt) > bufsize:
      splitAt = inpt.rfind(newline,0,bufsize)+1
      if not splitAt: # no newline, try to split on space
         splitAt = inpt.rfind(space,0,bufsize)+1
         if not splitAt:
            sys.stderr.write("Note: had to split eSpeak input and couldn't find a newline or space to do it on\n")
            splitAt = bufsize
      response = pipeThroughEspeak(inpt[:splitAt])
      if not newline in response.rstrip() and cmdWord in response: return response.strip() # 'bad cmd' / 'cmd not found'
      ret.append(response) ; inpt=inpt[splitAt:]
   try: w,r=os.popen4("espeak -q -x",bufsize=bufsize) # Python 2
   except AttributeError: # Python 3 (has no os.popen4)
      import subprocess
      proc=subprocess.Popen(['espeak','-q','-x'],stdin=subprocess.PIPE,stdout=subprocess.PIPE)
      # communicate() writes the input, closes stdin and reads the output
      out,err=proc.communicate(inpt)
      r = as_utf8("")
      if out: r += out
      if err: r += err # (stderr is not piped above, so err is None here)
   else: # Python 2
      getBuf(w).write(inpt) ; w.close()
      r = getBuf(r).read()
   return as_utf8("\n").join(ret) + r
2406
def espeak_version_line():
   "Runs espeak -h (with its standard error redirected to standard output) and returns the first line of what it printed"
   helpText = os.popen("espeak -h 2>&1").read()
   firstLine = helpText.strip().split("\n")[0]
   return firstLine
2408
def writeFormatHeader(format):
   """Prints the name of 'format', underlined with hyphens, as
   a heading for use when outputting in all formats.  A blank
   line is printed first on every call except the first, as
   the previous format's output might have been more than
   one line."""
   global writeFormatHeader_called
   if writeFormatHeader_called:
      print("") # separate from the previous format's output
   underline = '-'*len(format)
   print(format)
   print(underline)
   writeFormatHeader_called = True
# becomes True once the first header has been written:
writeFormatHeader_called = False
2417
def mainopt_check_variants(i):
   # undocumented (won't appear in help text)
   # (so do NOT give this function a doc string)
   # Groups the eSpeak format's non-consonant phonemes by the
   # integer part of their phoneme number, and plays each
   # group with more than one member through espeak, for as
   # long as the user answers 1 to the prompt.
   groups = {}
   for k,v in lexFormats['espeak'].items():
      if type(k)==str:
         intV = int(v)
         if not intV in consonants:
            groups.setdefault(intV,[]).append((v,k))
   # sorted() works on both Python 2 and 3 (in Python 3,
   # dict.items() is a view that has no sort method)
   for k,v in sorted(groups.items()):
      if len(v)==1: continue
      v.sort()
      while True:
         print("Group "+str(k))
         es = os.popen("espeak -x","w")
         getBuf(es).write(as_utf8('\n').join([markup_inline_word("espeak",w) for _,w in v]))
         del es # drop our reference to the pipe before prompting
         if not int(str(input("Again? 1/0: "))): break
2436
def mainopt_check_for_similar_formats(i):
   # undocumented (won't appear in help text)
   # (so do NOT give this function a doc string)
   # Counts, for every pair of formats, how many phoneme
   # numbers of the first have a different (or no) symbol in
   # the second, and prints the pairs from most to least
   # similar until the "names" format has been reached.
   # list() because in Python 3 dict.items() is a view that cannot be indexed or sliced:
   items = list(lexFormats.items()) ; r = []
   for idx,(k1,dic1) in enumerate(items):
      for k2,dic2 in items[idx+1:]:
         diff = 0
         for kk,vv in dic1.items():
            if not type(kk)==int: continue
            if kk==syllable_separator: continue
            # a phoneme missing from dic2 counts as a difference (get returns None, which never equals vv)
            if dic2.get(kk) != vv: diff += 1
         r.append((diff,k1,k2))
   r.sort() ; had = set()
   for diffs,format1,format2 in r:
      if format1 in had and format2 in had: continue
      had.add(format1) ; had.add(format2)
      if "names" in had: break
      print(str(diffs)+" phoneme differences between "+format1+" and "+format2)
2456
def festival_group_stress(pronunc):
   """Special-case cleanup_func for the Festival format.
   Takes space-separated phonemes with separate stress
   marks (0, 1 or 2) and returns them grouped in the form
   (((phonemes) stress) ((phonemes) stress) ...).
   A group is closed by a phoneme starting with one of
   a e i o u @; a stress mark applies to the most recently
   closed group (if it's not lower than what that group
   already has); and any phonemes left over at the end are
   added to the last closed group if there is one."""
   # TODO: do we ever need to add extra consonants to the
   # previous group instead of the next group?  (not sure
   # what difference it makes to the synthesis, but it
   # might make the entry a bit more readable)
   syllables = [] # closed groups, each [phonemes, stress]
   pending = []   # phonemes seen since the last group closed
   for token in pronunc.split():
      if token in ('0','1','2'):
         if syllables and token >= syllables[-1][1]:
            syllables[-1][1] = token
         continue
      pending.append(token)
      if token[:1] in 'aeiou@':
         syllables.append([pending,'0'])
         pending = []
   if pending:
      if syllables: syllables[-1][0].extend(pending)
      else: syllables.append([pending,'0']) # no vowel at all
   parts = []
   for phonemes,stress in syllables:
      parts.append("(("+' '.join(phonemes)+') '+stress+")")
   return "("+' '.join(parts)+")"
2479
def mainopt_convert(i):
   """*<from-format> <to-format>
Convert a user lexicon (generally from its default filename; if this cannot be found then lexconvert will tell you what it should be).
E.g.: python lexconvert.py --convert festival cepstral"""
   fromFormat = sys.argv[i+1]
   toFormat = sys.argv[i+2]
   if fromFormat==toFormat: return "Cannot convert a lexicon to its own format (that could result in it being truncated)"
   if toFormat=="mac-uk": return "Cannot permanently save a Mac-UK lexicon; please use the --mac-uk option to read text"
   if toFormat=="example": return "Cannot overwrite the built-in example lexicon"
   for f in [fromFormat,toFormat]:
      if not f in lexFormats: return "No such format "+repr(f)+" (use --formats to see a list of formats)"
   try:
      fname=getSetting(toFormat,"lex_filename")
      getSetting(toFormat,"lex_entry_format") # convert_user_lexicon will need this
   except KeyError: fname = None
   if not fname: return "Write support for lexicons of format '%s' not yet implemented (need at least lex_filename and lex_entry_format); try using --phones or --phones2phones options instead" % (toFormat,)
   if toFormat=="espeak":
      assert fname=="en_extra", "If you changed eSpeak's lex_filename in the table you also need to change the code below"
      if os.system("mv en_extra en_extra~ && (grep \" // \" en_extra~ || true) > en_extra"): sys.stderr.write("Warning: en_extra not found, making a new one\n(espeak compile will probably fail in this directory)\n") # otherwise keep the commented entries, so can incrementally update the user lexicon only
      outFile=open(fname,"a")
   else:
      existing = None
      try:
         f = open(fname)
         try: existing = getBuf(f).read()
         finally: f.close()
      except (IOError,OSError): pass # missing or unreadable file: treat it as empty (but don't swallow unrelated exceptions such as KeyboardInterrupt)
      assert not existing, "File "+replHome(fname)+" already exists and is not empty; are you sure you want to overwrite it?  (Delete it first if so)" # (if you run with python -O then this is ignored, as are some other checks so be careful)
      outFile=open(fname,"w")
   print ("Writing %s lexicon entries to %s file %s" % (fromFormat,toFormat,fname))
   try: convert_user_lexicon(fromFormat,toFormat,outFile)
   except Message:
      # don't leave a partly-written lexicon behind
      print (" - error, deleting "+fname)
      os.remove(fname) ; raise
2514
def mainopt_festival_dictionary_to_espeak(i):
   """<location>
Convert the Festival Oxford Advanced Learners Dictionary (OALD) pronunciation lexicon to eSpeak.
You need to specify the location of the OALD file in <location>,
e.g. for Debian festlex-oald package: python lexconvert.py --festival-dictionary-to-espeak /usr/share/festival/dicts/oald/all.scm
or if you can't install the Debian package, try downloading http://ftp.debian.org/debian/pool/non-free/f/festlex-oald/festlex-oald_1.4.0.orig.tar.gz, unpack it into /tmp, and do: python lexconvert.py --festival-dictionary-to-espeak /tmp/festival/lib/dicts/oald/oald-0.4.out
In all cases you need to cd to the eSpeak source directory before running this.  en_extra will be overwritten.  Converter will also read your ~/.festivalrc if it exists.  (You can later incrementally update from ~/.festivalrc using the --convert option; the entries from the system dictionary will not be overwritten in this case.)  Specify --without-check to bypass checking the existing eSpeak pronunciation for OALD entries (much faster, but makes a larger file and in some cases compromises the pronunciation quality)."""
   try: festival_location=sys.argv[i+1]
   except IndexError: return "Error: --festival-dictionary-to-espeak must be followed by the location of the festival OALD file (see help text)"
   # Check both files can be opened before starting, closing
   # each test handle straight away rather than leaving it
   # for the garbage collector:
   try: open(festival_location).close()
   except (IOError,OSError): return "Error: The specified OALD location '"+festival_location+"' could not be opened"
   try: open("en_list").close()
   except (IOError,OSError): return "Error: en_list could not be opened (did you remember to cd to the eSpeak dictsource directory first?"
   # 2nd argument: check existing eSpeak pronunciations unless --without-check was given; 3rd: True if ~/.festivalrc exists
   convert_system_festival_dictionary_to_espeak(festival_location,not '--without-check' in sys.argv,not os.system("test -e ~/.festivalrc"))
2529
def mainopt_syllables(i):
   """[<words>]
Attempt to break 'words' into syllables for music lyrics (uses espeak to determine how many syllables are needed)"""
   # As explained on mainopt_ruby's help text, espeak -x output can't be relied on to always put a space between every input word.  Rather than try to guess what espeak is going to do, here we simply put a newline after every input word instead.  This might affect eSpeak's output (so not recommended for mainopt_ruby), but it should be OK for just counting the syllables.  (Also, the assumption that the input words have been taken from song lyrics usefully rules out certain awkward punctuation cases.)
   for txt in getInputText(i+1,"word(s)",'maybe'):
      words = txt.split()
      # one word per line, with ! : and . removed
      toSend = as_utf8('\n').join(as_utf8(w) for w in words)
      for punct in ["!",":","."]:
         toSend = toSend.replace(as_utf8(punct),as_utf8(""))
      response = pipeThroughEspeak(toSend)
      if as_utf8('command') in response and not as_utf8('\n') in response.rstrip():
         return response.strip() # 'bad cmd' / 'cmd not found'
      phonemeLines = [l for l in response.split(as_utf8("\n")) if l]
      hyphenated = []
      for word,line in zip(words,phonemeLines):
         hyphenated.append(hyphenate(word,sylcount(convert(line,"espeak","example"))))
      print (" ".join(hyphenated))
      sys.stdout.flush() # in case piped
2541
def wordSeparator(format):
   """Returns the effective word separator of format: its word_separator setting if it has one, otherwise its phoneme_separator setting, otherwise a space"""
   fallback = checkSetting(format,"phoneme_separator"," ")
   return checkSetting(format,"word_separator",fallback)
2545
def mainopt_phones2phones(i):
   """*<format1> <format2> [<phonemes in format1>]
Perform a one-off conversion of phonemes from format1 to format2 (format2 can be 'all' if you want)""" # If format1 is 'example' and you don't specify phonemes, we take the words from the example lexicon.  But don't say that in the help string because it might confuse the issue about phonemes being optional on the command line and prompted for if not specified and stdin is not piped in all formats other than 'example'.
   format1,format2 = sys.argv[i+1],sys.argv[i+2]
   if not format1 in lexFormats: return "No such format "+repr(format1)+" (use --formats to see a list of formats)"
   if not format2 in lexFormats and not format2=="all": return "No such format "+repr(format2)+" (use --formats to see a list of formats)"
   if format1=="example" and len(sys.argv)<=i+3:
      # no phonemes on the command line: use piped-in text if there is any, otherwise the example lexicon's own entries
      if stdin_is_terminal(): txt=""
      else: txt=getBuf(sys.stdin).read() # and it might still be ""
      if txt: clauses = parseIntoWordsAndClauses(format1,txt) # (the result must be kept, or 'clauses' is undefined below)
      else: clauses=[[x[1]] for x in getSetting('example','lex_read_function')()]
   else: clauses = parseIntoWordsAndClauses(format1,getInputText(i+3,"phonemes in "+format1+" format"))
   if format2=="all": formats = sorted(k for k in lexFormats.keys() if not k=="example")
   else: formats = [format2]
   for format2 in formats:
      if len(formats)>1: writeFormatHeader(format2)
      getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_header")))
      output_clauses(format2,convert(clauses,format1,format2))
      getBuf(sys.stdout).write(as_utf8(checkSetting(format2,"inline_footer"))) ; print("")
2565
def parseIntoWordsAndClauses(format,phones):
   """Returns list of clauses, each of which is a list of words, assuming 'phones' are in format 'format'.
   Empty words and empty clauses are left out."""
   wordSep = checkSetting(format,"word_separator") # don't use wordSeparator() here - we're splitting, not joining, so we don't want it to default to phoneme_separator
   clauseSep = checkSetting(format,"clause_separator","\n")
   def splitArg(sep):
      "What to pass to split() for separator sep"
      if sep==" ": return None # " " means ANY whitespace (TODO: document this?)
      return maybe_bytes(sep,phones)
   if clauseSep and type(clauseSep) in [bytes,unicode]:
      rawClauses = phones.split(splitArg(clauseSep))
   else: rawClauses = [phones] # no usable clause separator: all one clause
   result = []
   for clause in rawClauses:
      if wordSep: words = clause.split(splitArg(wordSep))
      else: words = [clause]
      words = [w for w in words if w]
      if words: result.append(words)
   return result
2581
def mainopt_mac_uk(i):
   """<from-format> [<text>]
Speak text in Mac OS 10.7+ British voices while using a lexicon converted in from <from-format>. As these voices do not have user-modifiable lexicons, lexconvert must binary-patch your system's master lexicon; this is at your own risk! (Superuser privileges are needed the first time. A backup of the system file is made, and all changes are restored on normal exit but if you force-quit then you might need to restore the backup manually. Text speaking needs to be under lexconvert's control because it usually has to change the input words to make them fit the available space in the binary lexicon.) By default the Daniel voice is used; Emily or Serena can be selected by setting the MACUK_VOICE environment variable."""
   # If you have xterm etc, then text will also be printed, with words from the altered lexicon underlined.
   # Return the message instead of using assert: an assert is
   # removed when running under python -O, which would silently
   # drop this safety check.  (A string returned from a
   # mainopt_ function is printed with error exit.)
   if not sys.version_info[0]==2: return "--mac-uk has not been tested with Python 3, I don't want to risk messing up your system files, please use Python 2"
   fromFormat = sys.argv[i+1]
   if not fromFormat in lexFormats: return "No such format "+repr(fromFormat)+" (use --formats to see a list of formats)"
   lex = get_macuk_lexicon(fromFormat)
   try:
      for line in getInputText(i+2,"text",True):
         m = MacBritish_System_Lexicon(line,os.environ.get("MACUK_VOICE","Daniel"))
         # always close the lexicon object, even if interrupted
         try: m.readWithLex(lex)
         finally: m.close()
   except KeyboardInterrupt:
      sys.stderr.write("Interrupted\n")
2597
class Counter(object):
    "Holds two class-level counters, c (count) and sc (subcount), for use by the consonant(), vowel(), other() and variant() functions"
    c = 0  # incremented by other()
    sc = 0 # incremented by variant(), reset by other()
def other():
    "Used by Phonemes() when creating something that is neither a vowel nor a consonant, e.g. a stress mark.  Returns the next whole number, and resets the variant subcount."
    Counter.c += 1
    Counter.sc = 0
    return Counter.c
# numbers returned by consonant() and vowel() respectively:
consonants = set()
mainVowels = set()
def consonant():
    "Used by Phonemes() when creating a consonant (returns a new number from other() and records it in consonants)"
    phonemeId = other()
    consonants.add(phonemeId)
    return phonemeId
def vowel():
    "Used by Phonemes() when creating a vowel: allocates the next phoneme number and records it in the mainVowels set"
    number = other()
    mainVowels.add(number)
    return number
def opt_vowel():
    """Used by Phonemes() when creating an optional vowel (one that has no
    warning issued if some format doesn't support it).  The number is
    allocated like any other, but is not added to mainVowels."""
    number = other()
    return number
def variant():
    """Used by Phonemes() when creating a variant of the just-defined
    vowel/consonant/etc.  Returns (0, c.sc) where c.sc is a float built
    from the main count and the variant sub-count.  The leading 0 is so
    we can say _, name = variant() so as to get some extra indentation."""
    Counter.sc += 1
    # Skip sub-counts ending in 0: as a float, N.10 would be the same as N.1
    while Counter.sc % 10 == 0: Counter.sc += 1
    number = float(str(Counter.c) + "." + str(Counter.sc))
    return 0, number
2621
def ifset(var,a,b=""):
   """Checks the environment variable var; if it is set (non-empty),
   return a, otherwise return b.  Used in LexFormats to create tables
   with variations set by the environment."""
   import os
   value = os.environ.get(var,"")
   if not value: return b
   return a
2627
def speakjet(symbol,opcode):
   """Special-case function for the Speakjet table.  Depending on the
   environment, returns the opcode as chr(opcode) (SPEAKJET_BINARY set),
   the symbol (SPEAKJET_SYM set), or the opcode as a decimal string
   (neither set).  Setting both variables is rejected."""
   assert type(opcode)==int
   if not ifset('SPEAKJET_BINARY',1):
      # Text output: symbolic name if requested, else the decimal opcode
      return ifset('SPEAKJET_SYM',symbol,str(opcode))
   assert not ifset('SPEAKJET_SYM',1), "Cannot set both SPEAKJET_SYM and SPEAKJET_BINARY"
   return chr(opcode)
2635
def makeDic(doc,*args,**kwargs):
    """Make a dictionary with a doc string, default-bidirectional mappings and extra settings; see LexFormats for how this is used.

    doc: description string, stored under the key ("settings","doc").
    args: tuples of (key, value) or (key, value, bidir).  One of key/value
      must be a string (bytes or unicode) and the other a number (int or
      float).  Unless bidir is given and false, the reverse mapping
      value->key is stored as well.
    kwargs: each keyword k is stored under the key ('settings',k).  If
      'approximate_missing' is present, some missing vowels/consonants are
      approximated by joining the codes of other phonemes.

    Returns the dictionary, and also records it in the module-level
    variable lastDictionaryMade.  A warning is written to stderr if any
    member of consonants/mainVowels is still missing."""
    assert type(doc)==str, "doc must be a string"
    d = {} ; duplicates = set()
    for a in args:
        assert type(a)==tuple and (len(a)==2 or len(a)==3)
        k=a[0]
        if k in d: duplicates.add(k)
        v=a[1]
        assert (type(k) in [bytes,unicode] and type(v) in [int,float]) or (type(v) in [bytes,unicode] and type(k) in [int,float]), "Wrong types "+repr(a)+" (did you forget a _, before calling variant() or something?)"
        d[k] = v
        # Unicode keys are stored under their UTF-8 byte form as well
        if type(k)==unicode: d[as_utf8(k)] = v
        if len(a)==3: bidir=a[2]
        else: bidir=True
        if bidir:
            # (k,v,True) = both (k,v) and (v,k)
            if v in d: duplicates.add(v)
            d[v] = k
    # The message below also lists any global names whose value equals each duplicated key
    assert not duplicates, " Duplicate key(s) in "+repr(doc)+": "+", ".join((repr(dup)+"".join(" (="+g+")" for g,val in globals().items() if val==dup)) for dup in sorted(list(duplicates)))+". Did you forget a ,False to suppress bidirectional mapping?" # by the way, Python does not detect duplicate keys in {...} notation - it just lets you overwrite
    # Phoneme numbers registered by consonant()/vowel() that this format has no entry for
    missing = [l for l in (list(consonants)+list(mainVowels)) if not l in d]
    # did_approx = False
    if missing and 'approximate_missing' in kwargs:
      for miss,approxTo in [
          # TODO: put this table somewhere else?
          # (If the thing on the right is just 1 item, we could make the thing on the left a variant of it.  But that might not be a good idea unless they're really very close, since if it's a variant then the substitution is done without warning even if approximate_missing is not set.)
          (a_as_in_ago, [u_as_in_but]),
          (a_as_in_air, [e_as_in_them,r]),
          (ear, [e_as_in_eat,u_as_in_but]),
          (oor_as_in_poor, [close_to_or]), # TODO: ,r?
          (a_as_in_ah,[a_as_in_apple]), # this seems to be missing in some American voices (DecTalk, Keynote, SAM); TODO: is this the best approximation we can do?
          (a_as_in_apple,[a_as_in_ah]), # the reverse of the above, for Devanagari
          (o_as_in_orange,[oo_as_in_food]),(o_as_in_go,[oo_as_in_food]),(oy_as_in_toy,[oo_as_in_food,i_as_in_it]),(o_as_in_now,[a_as_in_ah, w]),(e_as_in_herd,[u_as_in_but,u_as_in_but]),(ar_as_in_year,[u_as_in_but,u_as_in_but]),(eye,[a_as_in_ah,y]),(th_as_in_think,[th_as_in_them]), # (Devanagari: is this really the best we can do?)
          ]:
        # Only approximate if every replacement phoneme is itself available.
        # The replacement codes are joined with the phoneme separator
        # (default a space), and only the number->string direction is added.
        if miss in missing and all(x in d for x in approxTo):
          d[miss]=maybe_bytes(kwargs.get("phoneme_separator"," "),d[approxTo[0]]).join(d[x] for x in approxTo)
          # did_approx = True
          missing.remove(miss)
    # if did_approx: doc="(approx.) "+doc # and see also the code in makeVariantDic.  Commenting out because this is misleading: the formats where we didn't do a did_approx might also contain approximations of some kind.  Incidentally there are some British English voices that need approximate_missing (e.g. Apollo 2)
    d[("settings","doc")] = doc
    if missing:
       import sys ; sys.stderr.write("WARNING: Some non-optional vowels/consonants are missing from "+repr(doc)+"\nThe following are missing: "+", ".join("/".join(g for g,val in globals().items() if val==m) for m in missing)+"\n")
    # Settings share the dictionary with the phonemes, under tuple keys
    for k,v in kwargs.items(): d[('settings',k)] = v
    assert type(d.get(('settings','cleanup_regexps'),[]))==list, "cleanup_regexps must be a list" # not one tuple
    assert type(d.get(('settings','cvtOut_regexps'),[]))==list, "cvtOut_regexps must be a list" # not one tuple
    # The separators must not collide with any phoneme code of this format
    wsep = d.get(('settings','word_separator'),None)
    psep = d.get(('settings','phoneme_separator'),' ')
    if not wsep==None: assert not wsep in d, "word_separator duplicates with a key in "+repr(doc)
    if not psep==None: assert not psep in d, "phoneme_separator duplicates with a key (did you forget to change the default, or to add a ,False somewhere?) in "+repr(doc)
    global lastDictionaryMade ; lastDictionaryMade = d
    return d
def makeVariantDic(doc,*args,**kwargs):
    """Like makeDic but create a new 'variant' version of the last-made dictionary, modifying some phonemes and settings (and giving it a new doc string) but keeping everything else the same.  Any list settings (e.g. cleanup_regexps) are ADDED to by the variant; other settings and phonemes are REPLACED if they are specified in the variant.  If you don't want subsequent variants to inherit the changes made by this variant, add noInherit=True to the keyword args.

    doc, args and the remaining kwargs are passed to makeDic unchanged.
    Returns a copy of the previously-made dictionary updated with the
    entries makeDic produced for this variant."""
    global lastDictionaryMade
    global mainVowels,consonants
    ldmOld = lastDictionaryMade
    toUpdate = lastDictionaryMade.copy()
    # noInherit is our own option, so remove it before makeDic turns the keyword arguments into settings
    noInherit = kwargs.pop('noInherit',False)
    oldV,oldC = mainVowels,consonants
    mainVowels,consonants = [],[] # so makeDic doesn't complain if some vowels/consonants are missing
    try: d = makeDic(doc,*args,**kwargs)
    finally: mainVowels,consonants = oldV,oldC # always restore the real sets, even if one of makeDic's assertions fails
    # NOTE(review): makeDic has just set lastDictionaryMade to d, i.e. to this variant's own entries rather than to the merged dictionary returned below - confirm that is what a following makeVariantDic call is meant to inherit from
    if noInherit: lastDictionaryMade = ldmOld
    # if toUpdate[("settings","doc")].startswith("(approx.) ") and not d[("settings","doc")].startswith("(approx.) "): d[("settings","doc")]="(approx.) "+d[("settings","doc")] # TODO: always?
    # List-valued entries are extended (old items first) rather than replaced
    for k,v in toUpdate.items():
       if type(v)==list and k in d: d[k] = v+d[k]
    toUpdate.update(d) ; return toUpdate
def getSetting(formatName,settingName):
  "Looks up a setting of the named format in lexFormats; raises KeyError if the format or the setting is not there"
  formatDic = lexFormats[formatName]
  key = ('settings',settingName)
  return formatDic[key]
def checkSetting(formatName,settingName,default=""):
  "Looks up a setting of the named format in lexFormats, returning default if the setting is not there (the format itself must exist)"
  formatDic = lexFormats[formatName]
  key = ('settings',settingName)
  return formatDic.get(key,default)
2711
2712 import sys,re,os
# Python 2/3 compatibility shims.  Each probe catches only the exception
# that signals "not available here", so that unrelated errors (and
# KeyboardInterrupt) are not silently swallowed.
try: from subprocess import getoutput # Python 3
except ImportError: from commands import getoutput # Python 2
try: bytes # Python 3 and newer Python 2
except NameError: bytes = str # older Python 2
try: unicode # Python 2
except NameError: # Python 3
   unicode,unichr,xrange = str,chr,range
   # From here on chr() returns a 1-byte byte-string, as it did in Python 2
   # (unichr above was bound to the builtin chr before this redefinition)
   def chr(x): return bytes([x])
   # sorted() with an optional Python 2 style comparison function as 2nd argument
   _builtin_sorted = sorted
   from functools import cmp_to_key
   def sorted(l,theCmp=None):
      if theCmp:
         return _builtin_sorted(l,key=cmp_to_key(theCmp))
      else: return _builtin_sorted(l)
   # Compare (major,minor) rather than just the minor number, so the check stays correct for any major version
   assert tuple(sys.version_info[:2]) >= (3,5), "lexconvert cannot run on Python 3.4 due to lack of byte-string percent formatting (PEP 461).  Please use Python 3.5+ or stick with Python 2."
def getBuf(f):
   """Return a buffer to which bytes may be written, for Python 2 and 3
   compatibility: f.buffer if f has one (Python 3 text streams),
   otherwise f itself (Python 2)."""
   return getattr(f,"buffer",f)
2732
# One-entry cache for make_dictionary: the last (source,dest) pair and its result
cached_sourceName,cached_destName,cached_dict = None,None,None
def make_dictionary(sourceName,destName):
    """Uses lexFormats to make a mapping dictionary from a particular source format to a particular dest format, and also sets module variables for that particular conversion (TODO: put those module vars into an object in case someone wants to use this code in a multithreaded server)

    Returns a dictionary mapping the source format's phoneme strings to
    the dest format's strings.  The module variables set are
    dest_consonants, dest_syllable_sep and implicit_vowel_before_NL.
    Only the most recent (sourceName,destName) result is cached; on a
    cache hit the module variables are left as that call set them."""
    global cached_sourceName,cached_destName,cached_dict
    if (sourceName,destName) == (cached_sourceName,cached_destName): return cached_dict
    source = lexFormats[sourceName]
    dest = lexFormats[destName]
    d = {}
    global dest_consonants ; dest_consonants = set()
    global dest_syllable_sep ; dest_syllable_sep = dest.get(syllable_separator,"")
    global implicit_vowel_before_NL
    implicit_vowel_before_NL = None
    for k,v in source.items():
      if type(k)==tuple: continue # settings
      if type(v) in [bytes,unicode]: continue # (num->string entries are for converting IN to source; we want the string->num entries for converting out)
      if not v in dest: v = int(v) # (try the main version of a variant)
      if not v in dest: continue # (haven't got it - will have to ignore or break into parts)
      assert type(k) in [bytes,unicode]
      d[k] = dest[v]
      # Remember which dest strings are consonants (convert() uses this when placing stress marks)
      if int(v) in consonants: dest_consonants.add(d[k])
      # Prefer the main (non-variant) e_as_in_herd: a variant is only taken if nothing has been recorded yet
      if int(v)==e_as_in_herd and (not implicit_vowel_before_NL or v==int(v)): # TODO: or u_as_in_but ?  used by festival and some other synths before words ending 'n' or 'l' (see usage of implicit_vowel_before_NL later)
        implicit_vowel_before_NL = d[k]
      # Make the entry reachable from both the UTF-8 and (where decodable) the Unicode form of the key
      d[as_utf8(k)] = d[k]
      try: d[as_unicode(k)] = d[k]
      except UnicodeDecodeError: pass
    try:
       if any(type(v)==unicode for v in d.values()): d,dest_consonants=dict((k,as_unicode(v)) for k,v in d.items()),set(as_unicode(v) for v in dest_consonants) # Python 2: if ANY dest are Unicode, make them ALL Unicode
    except UnicodeDecodeError: d,dest_consonants=dict((k,as_utf8(v)) for k,v in d.items()),set(as_utf8(v) for v in dest_consonants) # ... or make them ALL byte-strings if some were binary and not readable as UTF-8
    cached_sourceName,cached_destName,cached_dict=sourceName,destName,d
    return d
2763
# (character, context) pairs for which convert() has already written an "ignoring ... character" warning
warnedAlready = set()
def convert(pronunc,source,dest):
    """Convert pronunc from source to dest.  pronunc can be a string or a list; if a list then we'll recurse on each of the list elements and return a new list (this is meant for batch-converting clauses etc)

    source and dest are format names (keys of lexFormats).  The source
    format's cvtOut_func and cvtOut_regexps are applied first, then the
    input is matched greedily (longest phoneme first) against the
    dictionary from make_dictionary, and finally the dest format's
    cleanup_regexps and cleanup_func are applied.  Input characters that
    cannot be matched are dropped; a warning is written to stderr once
    per (character, preceding phoneme) unless the source format's
    safe_to_drop_characters setting says that character may be dropped
    silently."""
    assert type(pronunc) in [bytes,unicode,list], type(pronunc)
    if source==dest: return pronunc # essential for --try experimentation with codes not yet supported by lexconvert
    if type(pronunc)==list: return [convert(p,source,dest) for p in pronunc]
    func = checkSetting(source,'cvtOut_func')
    if func: pronunc=func(pronunc)
    for s,r in checkSetting(source,'cvtOut_regexps'):
        pronunc=re.sub(maybe_bytes(s,pronunc),maybe_bytes(r,pronunc),pronunc)
    ret = [] ; toAddAfter = None
    dictionary = make_dictionary(source,dest)
    maxLen=max([len(l) for l in dictionary.keys()] or [0]) # (the "or [0]" is for when the two formats have nothing in common: every character is then dropped below, instead of max() raising ValueError)
    debugInfo=""
    separator = checkSetting(dest,'phoneme_separator',' ')
    safe_to_drop = checkSetting(source,"safe_to_drop_characters")
    while pronunc:
        # Try the longest possible match first; lettersToTry==0 means nothing matched
        for lettersToTry in range(maxLen,-1,-1):
            if not lettersToTry:
              if safe_to_drop==True: pass
              # The parentheses matter: without them "and" binds tighter than "or", so a format with no safe_to_drop_characters would repeat the same warning every time instead of once
              elif ((not safe_to_drop) or not pronunc[:1] in maybe_bytes(safe_to_drop,pronunc)) and not (pronunc[:1],debugInfo) in warnedAlready:
                 warnedAlready.add((pronunc[:1],debugInfo))
                 sys.stderr.write("Warning: ignoring "+source+" character "+repr(pronunc[:1])+debugInfo+" (unsupported in "+dest+")\n")
              pronunc=pronunc[1:] # ignore
            elif pronunc[:lettersToTry] in dictionary:
                debugInfo=" after "+as_printable(pronunc[:lettersToTry])
                toAdd=dictionary[pronunc[:lettersToTry]]
                assert type(toAdd) in [bytes,unicode], type(toAdd)
                isStressMark=(toAdd and toAdd in [maybe_bytes(lexFormats[dest].get(primary_stress,''),toAdd),maybe_bytes(lexFormats[dest].get(secondary_stress,''),toAdd)])
                if toAdd==maybe_bytes(lexFormats[dest].get(syllable_separator,''),toAdd): pass
                elif isStressMark and not checkSetting(dest,"stress_comes_before_vowel"):
                    if checkSetting(source,"stress_comes_before_vowel"): toAdd, toAddAfter = maybe_bytes("",toAdd),toAdd # move stress marks from before vowel to after
                    else: # stress is already after, but:
                        # With Cepstral synth (and kana-approx), stress mark should be placed EXACTLY after the vowel and not any later.  Might as well do this for others also.
                        r=len(ret)-1
                        while ret[r] in dest_consonants or ret[r].endswith(maybe_bytes("*added",ret[r])): r -= 1 # (if that raises IndexError then the input had a stress mark before any vowel) ("*added" condition is there so that implicit vowels don't get the stress)
                        ret.insert(r+1,toAdd) ; toAdd=maybe_bytes("",toAdd)
                elif isStressMark and not checkSetting(source,"stress_comes_before_vowel"): # it's a stress mark that should be moved from after the vowel to before it
                    i=len(ret)
                    while i and (ret[i-1] in dest_consonants or ret[i-1].endswith(maybe_bytes("*added",ret[i-1]))): i -= 1
                    if i: i-=1
                    ret.insert(i,toAdd)
                    if dest_syllable_sep: ret.append(maybe_bytes(dest_syllable_sep,toAdd)) # (TODO: this assumes stress marks are at end of syllable rather than immediately after vowel; correct for Festival; check others; probably a harmless assumption though; mac-uk is better with syllable separators although espeak basically ignores them)
                    toAdd = maybe_bytes("",toAdd)
                # attempt to sort out the festival dictionary's (and other's) implicit_vowel_before_NL
                elif implicit_vowel_before_NL and ret and ret[-1] and toAdd in [maybe_bytes('n',toAdd),maybe_bytes('l',toAdd)] and ret[-1] in dest_consonants: ret.append(maybe_bytes(implicit_vowel_before_NL,toAdd)+maybe_bytes('*added',toAdd))
                elif len(ret)>2 and ret[-2].endswith(maybe_bytes('*added',ret[-2])) and toAdd and not toAdd in dest_consonants and not toAdd==dest_syllable_sep: del ret[-2]
                if toAdd:
                    # Add it, but if toAdd is multiple phonemes, try to put toAddAfter after the FIRST phoneme
                    if separator: toAddList=toAdd.split(separator)
                    else: toAddList = [toAdd] # TODO: won't work for formats that don't have a phoneme separator (doesn't really matter for eSpeak though)
                    ret.append(toAddList[0])
                    if toAddAfter and not toAddList[0] in dest_consonants:
                        ret.append(toAddAfter)
                        toAddAfter=None
                    ret += toAddList[1:]
                pronunc=pronunc[lettersToTry:]
                break
    if toAddAfter: ret.append(toAddAfter)
    if ret and ret[-1]==dest_syllable_sep: del ret[-1] # spurious syllable separator at end
    if not ret: ret = ""
    else: ret=maybe_bytes(separator,ret[0]).join(ret).replace(maybe_bytes('*added',ret[0]),maybe_bytes('',ret[0])) # (the "*added" markers on implicit vowels are internal only)
    for s,r in checkSetting(dest,'cleanup_regexps'):
      ret=re.sub(maybe_bytes(s,ret),maybe_bytes(r,ret),ret)
    func = checkSetting(dest,'cleanup_func')
    if func: return func(ret)
    else: return ret
2831
def unicode_preprocess(pronunc):
   """Special-case cvtOut_func for unicode-ipa etc: tries to catch \\uNNNN etc.
   If pronunc contains backslash-u escapes (and no double-quote) they are
   evaluated; otherwise an attempt is made to decode pronunc as UTF-8.
   This is best-effort: if neither works, pronunc is returned unchanged."""
   if maybe_bytes("\\u",pronunc) in pronunc and not maybe_bytes('"',pronunc) in pronunc: # maybe \uNNNN copied from Gecko on X11, can just evaluate it to get the unicode
      # (NB make sure to quote the \'s if passing in on the command line)
      # SECURITY NOTE(review): eval() on input text.  The check above for '"' means the input is only ever evaluated as the inside of one string literal, but consider replacing this with a non-eval decoder if pronunc can come from an untrusted source.
      try: pronunc=eval('u"'+pronunc+'"')
      except Exception: pass # not evaluable (bad escape, wrong string type, etc): leave as is, but don't swallow KeyboardInterrupt/SystemExit
   else: # see if it makes sense as utf-8
      try: pronunc = pronunc.decode('utf-8')
      except (AttributeError,UnicodeError): pass # no decode method (Python 3 str), or not valid UTF-8
   return pronunc
2842
def ascii_braille_to_unicode(a):
  """Special-case cleanup_func for braille-ipa (set by braille-ipa if
  BRAILLE_UNICODE is set).  Converts Braille ASCII to Unicode dot
  patterns; characters not in the table are passed through unchanged."""
  brailleAscii = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
  table = {}
  # Pair each Braille ASCII character with the code point at the same offset from U+2800
  for asciiChar,codePoint in zip(brailleAscii,range(0x2800,0x2840)):
    table[asciiChar] = unichr(codePoint)
  return u''.join(table.get(ch,ch) for ch in a)
def unicode_to_ascii_braille(u):
  """Converts Unicode Braille dot patterns (U+2800 to U+283F) in u to
  Braille ASCII, passing other characters through unchanged.  If the
  result starts with ",7" and ends with "7'", those two-character
  wrappers are removed."""
  brailleAscii = " A1B'K2L@CIF/MSP\"E3H9O6R^DJG>NTQ,*5<-U8V.%[$+X!&;:4\\0Z7(_?W]#Y)="
  table = {}
  for codePoint,asciiChar in zip(range(0x2800,0x2840),brailleAscii):
    table[unichr(codePoint)] = asciiChar
  converted = ''.join(table.get(ch,ch) for ch in as_unicode(u))
  if converted.startswith(",7") and converted.endswith("7'"):
    return converted[2:-2]
  return converted
2852
def hiragana_to_katakana(u):
   """Special-case cleanup_func for kana-approx; converts all hiragana
   characters in unicode string 'u' into katakana if KANA_TYPE is set to
   anything beginning with a 'k' (case-insensitive); otherwise returns u
   unchanged."""
   assert type(u)==unicode
   kanaType = os.environ.get("KANA_TYPE","").lower()
   if not kanaType.startswith("k"): return u
   def toKatakana(ch):
      # Characters U+3041 to U+3096 are moved up by 0x60; anything else is kept
      code = ord(ch)
      if 0x3041 <= code <= 0x3096: return unichr(code+0x60)
      return ch
   return u"".join([toKatakana(ch) for ch in u])
2862
def espeak_probably_right_already(existing_pronunc,new_pronunc):
    """Used by convert_system_festival_dictionary_to_espeak to compare a "new" pronunciation with eSpeak's existing pronunciation.  As the transcription from OALD to eSpeak is only approximate, it could be that our new pronunciation is not identical to the existing one but the existing one is actually correct; try to detect when this happens by checking if the pronunciations are the same after some simplifications.

    Returns True if the two are identical or become identical after
    simplification; otherwise falls off the end (returning None)."""
    if existing_pronunc==new_pronunc: return True
    # Applied one after another, so the order matters (e.g. "I2" must be handled before "I")
    substitutions = [
        (";",""), ("%",""),
        ("a2","@"),
        ("3","@"),
        ("L","l"),
        ("I2","i:"),
        ("I","i:"), ("i@","i:@"),
        (",",""),
        ("s","z"),
        ("aa","A:"),
        ("A@","A:"),
        ("O@","O:"),
        ("o@","O:"),
        ("r-","r"),
    ]
    def simplify(pronunc):
        for old,new in substitutions:
            pronunc = pronunc.replace(maybe_bytes(old,pronunc),maybe_bytes(new,pronunc))
        return pronunc
    # TODO: rewrite @ to 3 whenever not followed by a vowel?
    simplifiedOld = as_printable(simplify(existing_pronunc))
    simplifiedNew = as_printable(simplify(new_pronunc))
    if simplifiedOld==simplifiedNew: return True # almost the same, and festival @/a2 etc seems to be a bit ambiguous so leave it alone
2882
def parse_festival_dict(festival_location):
    """For OALD; yields word,part-of-speech,pronunciation

    festival_location is the path of the Festival dictionary file.
    Each usable line yields a tuple (word in lower case, part of
    speech, pronunciation string) with the quotes and parentheses
    removed and anything from "((pos" onward discarded.  Lines with
    fewer than three fields, or whose second field is not one of the
    recognised part-of-speech tags, are skipped.  The file is closed
    when the generator is exhausted or closed."""
    recognised_pos = ['n','v','a','cc','dt','in','j','k','nil','prp','uh']
    dictFile = open(festival_location)
    try:
        for line in dictFile:
            line=line.strip()
            if "((pos" in line: line=line[:line.index("((pos")]
            if line.startswith('( "'): line=line[3:]
            line=line.replace('"','').replace('(','').replace(')','')
            try:
                word, pos, pronunc = line.split(None,2)
            except ValueError: continue # malformed line
            if pos not in recognised_pos: continue # two or more words
            yield (word.lower(), pos, pronunc)
    finally: dictFile.close() # don't leave the file open until garbage collection
2896
class Message(Exception):
    "Exception whose text is a message for the user, e.g. raised by read_user_lexicon when a lexicon cannot be read"
def convert_system_festival_dictionary_to_espeak(festival_location,check_existing_pronunciation,add_user_dictionary_also):
    """See mainopt_festival_dictionary_to_espeak

    Works in the current directory, which must contain eSpeak's en_list
    (and en_extra, which is renamed to en_extra~ and rewritten).  Shells
    out to mv and espeak, and uses /tmp/.pronunc as a scratch file.

    festival_location: path of the Festival dictionary, as read by
      parse_festival_dict.
    check_existing_pronunciation: if true, ask espeak for its current
      pronunciation of each word and omit words it probably already
      gets right.
    add_user_dictionary_also: if true, the user's festival lexicon is
      also converted and written to en_extra.

    Returns the list of words that were not output because eSpeak's
    existing pronunciation was judged to be right already."""
    os.system("mv en_extra en_extra~") # start with blank 'extra' dictionary
    if check_existing_pronunciation: os.system("espeak --compile=en") # so that the pronunciation we're checking against is not influenced by a previous version of en_extra
    outFile=open("en_extra","w")
    print ("Reading dictionary lists")
    # wordDic: word -> (pronunc,pos).  ambiguous: UTF-8 encoded words that must not be added to wordDic.
    wordDic = {} ; ambiguous = {}
    el = open("en_list")
    for line in filter(lambda x:x.split() and not re.match(maybe_bytes(r'^[a-z]* *\$',x),x),getBuf(el).read().split(as_utf8('\n'))): ambiguous[line.split()[0]]=ambiguous[line.split()[0]+as_utf8('s')]=True # this stops the code below from overriding anything already in espeak's en_list.  If taking out then you need to think carefully about words like "a", "the" etc.
    for word,pos,pronunc in parse_festival_dict(festival_location):
        pronunc=pronunc.replace("i@ 0 @ 0","ii ou 2 ").replace("i@ 0 u 0","ii ou ") # (hack for OALD's "radio"/"video"/"stereo"/"embryo" etc)
        pronunc=pronunc.replace("0","") # 0's not necessary, and OALD sometimes puts them in wrong places, confusing the converter
        if word in ['mosquitoes']: continue # OALD bug (TODO: any others?)
        # A word seen twice with a different pronunciation or part of speech is dropped altogether
        if word in wordDic and not wordDic[word]==(pronunc,pos):
            ambiguous[as_utf8(word)] = True
            del wordDic[word] # better not go there
        if not as_utf8(word) in ambiguous:
            wordDic[word] = (pronunc, pos)
    toDel = []
    if check_existing_pronunciation:
        print ("Checking existing pronunciation")
        # Words are piped to espeak one per line; its output goes to /tmp/.pronunc
        proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
        wList = []
    progressCount=0 ; oldPercent=-1
    itemList = list(wordDic.items())
    # Make sure it's NOT sorted, to ensure eSpeak doesn't
    # cache pronunciation of previous word when add suffix
    # (which can subtly change eSpeak's pronunciation in
    # some versions of eSpeak, leading to
    # Python 2/3 differences as Python 3 sorts by default) :
    # (This is done by sorting first, so that the starting order is always
    # the same, and then alternately taking the last remaining item of the
    # first half and of the second half of the sorted list.)
    itemList.sort()
    i0,i1 = itemList[:int(len(itemList)/2)],itemList[int(len(itemList)/2):]
    itemList = []
    while i0 or i1:
       if i0: itemList.append(i0.pop())
       if i1: itemList.append(i1.pop())
    for word,(pronunc,pos) in itemList:
        if check_existing_pronunciation:
            percent = int(progressCount*100/len(wordDic))
            if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
            oldPercent=percent
            progressCount += 1
        if not re.match("^[A-Za-z]*$",word): # (some versions of eSpeak also OK with "-", but not all)
            # contains special characters - better not go there
            toDel.append(word)
        elif word.startswith("plaque") or word in "friday saturday sunday tuesday thursday yesterday".split():
            # hack to accept eSpeak's pl'ak instead of pl'A:k - order was reversed in the March 2009 draft
            toDel.append(word)
        elif word[-1]=="s" and word[:-1] in wordDic:
            # unnecessary plural (espeak will pick up on them anyway)
            toDel.append(word)
        elif word.startswith("year") or "quarter" in word: toDel.append(word) # don't like festival's pronunciation of those (TODO: also 'memorial' why start with [m'I])
        elif check_existing_pronunciation:
            getBuf(proc).write(as_utf8(word)+as_utf8("\n"))
            proc.flush() # so the progress indicator works
            wList.append(word)
    if check_existing_pronunciation:
        proc.close() ; print("")
        # Pair each word sent to espeak with the corresponding output line (spaces removed)
        oldPronDic = {}
        tp = open("/tmp/.pronunc")
        for k,v in zip(wList,getBuf(tp).read().split(as_utf8("\n"))): oldPronDic[k]=v.strip().replace(as_utf8(" "),as_utf8(""))
    for w in toDel: del wordDic[w]
    print ("Doing the conversion")
    lines_output = 0
    total_lines = 0
    not_output_because_ok = []
    items = list(wordDic.items()) ; items.sort() # necessary because of the hacks below which check for the presence of truncated versions of the word (want to have decided whether or not to output those truncated versions before reaching the hacks)
    for word,(pronunc,pos) in items:
        total_lines += 1
        new_e_pronunc = convert(pronunc,"festival","espeak")
        if new_e_pronunc.count("'")==2 and not '-' in word: new_e_pronunc=new_e_pronunc.replace("'",",",1) # if 2 primary accents then make the first one a secondary (except on hyphenated words)
        # TODO if not en-rp? - if (word.endswith("y") or word.endswith("ie")) and new_e_pronunc.endswith("i:"): new_e_pronunc=new_e_pronunc[:-2]+"I"
        unrelated_word = None
        if check_existing_pronunciation: espeakPronunc = oldPronDic.get(word,"")
        else: espeakPronunc = ""
        if word[-1]=='e' and word[:-1] in wordDic: unrelated_word, espeakPronunc = word[:-1],"" # hack: if word ends with 'e' and dropping the 'e' leaves a valid word that's also in the dictionary, we DON'T want to drop this word on the grounds that espeak already gets it right, because if we do then adding 's' to this word may cause espeak to add 's' to the OTHER word ('-es' rule).
        if espeak_probably_right_already(espeakPronunc,new_e_pronunc):
            not_output_because_ok.append(word)
            continue
        if not unrelated_word: lines_output += 1
        getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+as_utf8(new_e_pronunc)+as_utf8(" // from Festival's (")+as_utf8(pronunc)+as_utf8(")"))
        if espeakPronunc: getBuf(outFile).write(as_utf8(", not [[")+as_utf8(espeakPronunc)+as_utf8("]]"))
        elif unrelated_word: getBuf(outFile).write(as_utf8(" (here to stop espeak's affix rules getting confused by Festival's \"")+as_utf8(unrelated_word)+as_utf8("\")"))
        getBuf(outFile).write(as_utf8("\n"))
    print ("Corrected(?) %d entries out of %d" % (lines_output,total_lines))
    if add_user_dictionary_also: convert_user_lexicon("festival","espeak",outFile)
    outFile.close()
    os.system("espeak --compile=en")
    if not_output_because_ok:
      # Second pass: re-ask espeak about the words we left alone, now that the new en_extra has been compiled
      print ("Checking for unwanted side-effects of those corrections") # e.g. terrible as Terr + ible, inducing as in+Duce+ing
      proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
      progressCount = 0
      for w in not_output_because_ok:
          getBuf(proc).write(as_utf8(w)+as_utf8("\n")) ; proc.flush()
          percent = int(progressCount*100/len(not_output_because_ok))
          if not percent==oldPercent: sys.stdout.write(str(percent)+"%\r") ; sys.stdout.flush()
          oldPercent = percent
          progressCount += 1
      proc.close()
      outFile=open("en_extra","a") # append to it
      tp = open("/tmp/.pronunc")
      # NOTE(review): oldPronDic is only assigned when check_existing_pronunciation is true, but it is read below whenever not_output_because_ok is non-empty - confirm that combination cannot occur
      for word,pronunc in zip(not_output_because_ok,getBuf(tp).read().split(as_utf8("\n"))):
        pronunc = pronunc.strip().replace(as_utf8(" "),as_utf8(""))
        if not pronunc==oldPronDic[word] and not espeak_probably_right_already(oldPronDic[word],pronunc):
          getBuf(outFile).write(as_utf8(word)+as_utf8(" ")+oldPronDic[word]+as_utf8(" // (undo affix-side-effect from previous words that gave \"")+pronunc+as_utf8("\")\n"))
      outFile.close()
      os.system("espeak --compile=en")
    return not_output_because_ok
3006
def read_user_lexicon(fromFormat):
    """Calls the appropriate lex_read_function, opening lex_filename first if supplied.
    Returns whatever the format's lex_read_function returns.  Raises
    Message if the format has no lex_read_function or if its lexicon
    file cannot be opened."""
    readFunction = checkSetting(fromFormat,"lex_read_function")
    if not readFunction: raise Message("Reading from '%s' lexicon file not yet implemented (no lex_read_function); try using --phones or --phones2phones options instead" % (fromFormat,))
    # Stays None if there is no lex_filename setting (lex_read_function without lex_filename is allowed, if the read function can take null param and fetch the lexicon itself) or if the setting is None (e.g. the example lexicon)
    lexfile = None
    try:
       lexFilename = getSetting(fromFormat,"lex_filename")
       if not lexFilename==None:
          lexfile = open(lexFilename)
          quiet = os.environ.get("LEXCONVERT_OMIT_READING_FROM","") # TODO: document LEXCONVERT_OMIT_READING_FROM (might be useful for the --mac-uk option)
          if not quiet: print ("Reading from "+lexFilename)
    except KeyError: pass
    except IOError: raise Message(fromFormat+"'s lexicon is expected to be in a file called "+replHome(lexFilename)+" which could not be read - please fix and try again")
    return readFunction(lexfile)
3020
def replHome(fname):
   """Format fname for printing: if the HOME environment variable is set
   and fname lies below that directory, the HOME prefix is replaced by
   ~; otherwise fname is returned unchanged."""
   home = os.environ.get('HOME','')
   if not home or not fname.startswith(home+os.sep): return fname
   return "~"+fname[len(home):]
3027
def get_macuk_lexicon(fromFormat):
    """Converts lexicon from fromFormat and returns a list suitable for
    MacBritish_System_Lexicon's readWithLex: (word, pronunciation) pairs
    with each pronunciation converted to the mac-uk format."""
    lexicon = []
    for word, pronunc in read_user_lexicon(fromFormat):
        lexicon.append((word,convert(pronunc,fromFormat,"mac-uk")))
    return lexicon
3031
def as_utf8(s):
   "Returns s encoded as UTF-8 if it is a unicode string; anything else is returned unchanged"
   if not type(s)==unicode: return s
   return s.encode('utf-8')
def as_unicode(s):
   "Returns s unchanged if it is a unicode string; anything else is decoded from UTF-8"
   if not type(s)==unicode: return s.decode('utf-8')
   return s
def maybe_bytes(s,i):
   """Python 2/3 compatibility: convert s to bytes if i is bytes.
   (More precisely: s is returned as it is when i is a unicode string,
   and passed through as_utf8 otherwise.)"""
   if not type(i)==unicode: return as_utf8(s)
   return s
def as_printable(s):
   "Returns s in the running Python's native string type: the UTF-8 byte-string on Python 2, or that byte-string decoded back to str on Python 3"
   encoded = as_utf8(s)
   if sys.version_info[0] >= 3: return encoded.decode('utf-8')
   return encoded
3045
def convert_user_lexicon(fromFormat,toFormat,outFile):
    """See mainopt_convert.  Reads the user lexicon of fromFormat and
    writes it to outFile in toFormat: the lex_header setting, then one
    lex_entry_format line per entry (with the word's case changed if
    lex_word_case is "upper" or "lower"), then the lex_footer setting."""
    lex = read_user_lexicon(fromFormat)
    def writeExtra(extra):
        # A header/footer setting is either literal text, or a function that is given the output file
        if type(extra) in [bytes,unicode]: getBuf(outFile).write(as_utf8(extra))
        else: extra(outFile)
    writeExtra(checkSetting(toFormat,"lex_header"))
    entryFormat=getSetting(toFormat,"lex_entry_format")
    wordCase=checkSetting(toFormat,"lex_word_case")
    for word, pronunc in lex:
        converted = as_utf8(convert(pronunc,fromFormat,toFormat))
        if wordCase=="upper": word=word.upper()
        elif wordCase=="lower": word=word.lower()
        entry = as_utf8(entryFormat) % (as_utf8(word),as_utf8(converted)) # will work in Python 3.6, but not in Python 3.4 (e.g. on jessie) which cannot do % on byte-strings
        getBuf(outFile).write(entry)
    writeExtra(checkSetting(toFormat,"lex_footer"))
3062
def bbcMicro_partPhonemeCount(pronunc):
   """Returns the number of 'part phonemes' (at least that's what I'm calling them) for the BBC Micro phonemes in pronunc.  The *SPEAK command cannot take more than 117 part-phonemes at a time before saying "Line too long", and in some cases it takes less than that (I'm not sure why); 115 is a safer limit.  Characters that match no phoneme must be pitch digits 1-8 (asserted) and add nothing to the count."""
   # phonemes and space count, but pitch numbers do not count.
   # Order matters: the first match wins, so longer names precede their prefixes (DUX before D, /UL before /U, etc)
   phonemes = ' ,AA,AE,AH,AI,AO,AW,AY,B,CH,CT,DH,DUX,D,EE,EH,ER,F,G,/H,IH,IX,IY,J,K,L,M,NX,N,OW,OL,OY,O,P,R,SH,S,TH,T,UH,/UL,/U,UW,UX,V,W,Y,ZH,Z'.split(',')
   weights = {
      # *SPEAK can take 117 of most single-letter phonemes, or 116 (limited by the 232+6-character input limit) of most 2-letter phonemes
      'AW':2,'IY':2,'OW':2,'OL':2,'UW':2,'/UL':2, # *SPEAK can take 58 of these
      'DUX':3,'AY':3,'CH':3,'J':3,'OY':3, # *SPEAK can take 39 of these
      'CT':4, # *SPEAK can take 29 of these
   }
   total = 0 ; remaining = pronunc
   while remaining:
      for p in phonemes:
         if remaining.startswith(as_utf8(p)):
            total += weights.get(p,1)
            remaining = remaining[len(p):]
            break
      else: # nothing matched at this position
         assert as_printable(remaining[:1]) in '12345678',"Unrecognised BBC Micro phoneme at "+str(remaining)+" in "+str(pronunc)
         remaining = remaining[1:]
   return total
3081
def markup_inline_word(format,pronunc):
    "Returns pronunc with any necessary markup for putting it in a text (using the inline_format setting, which defaults to %s).  A string setting is applied with the % operator; any other setting is called with the UTF-8 pronunciation"
    # UTF-8 output - ok for pasting into Firefox etc *IF* the terminal/X11 understands utf-8 (otherwise redirect to a file, point the browser at it, and set encoding to utf-8, or try --convert'ing which will o/p HTML)
    pronunc = as_utf8(pronunc)
    fmt = checkSetting(format,"inline_format","%s")
    if not type(fmt) in [bytes,unicode]: return fmt(pronunc)
    if type(fmt)==unicode: fmt = fmt.encode('utf-8') # see above
    return fmt % pronunc
def markup_doubleTalk_word(pronunc):
   "Special-case function set as inline_format in doubletalk (checks environment variables for command code).  The command character is chr of the integer in DTALK_COMMAND_CODE if that is set and non-empty, otherwise '*'; the result is command+D+pronunc+command+T"
   code = os.environ.get('DTALK_COMMAND_CODE','')
   if not code: cmd = as_utf8('*')
   else: cmd = chr(int(code))
   template = as_utf8("%sD%s%sT")
   return template % (cmd,pronunc,cmd)
def markup_bbcMicro_word(pronunc):
   "Special-case function set as inline_format in bbcmicro.  Begins a new *SPEAK command when necessary.  See also write_bbcmicro_phones.  Tracks the part-phonemes and characters used on the current command in the globals bbc_partsSoFar and bbc_charsSoFar; a zero in either one forces a new command"
   global bbc_partsSoFar,bbc_charsSoFar
   parts = bbcMicro_partPhonemeCount(pronunc)
   # 238 is max len of BBC BASIC prompt (both the immediate prompt and the one with line number supplied by AUTO, in both BASIC II and BASIC IV); re other limit see bbcMicro_partPhonemeCount
   fits = bbc_partsSoFar and bbc_partsSoFar+parts <= 115 and bbc_charsSoFar and bbc_charsSoFar+len(pronunc) <= 238
   if fits:
      # stay on the current command; the +1s are for the space that follows this word
      bbc_charsSoFar += len(pronunc)+1
      bbc_partsSoFar += parts+1
      return pronunc
   # otherwise start a new command, on a new line unless nothing has been output for this one yet
   if bbc_charsSoFar: prefix="\n"
   else: prefix=""
   cmd="*SPEAK" # (could add a space if want to make it more readable, at the expense of an extra keystroke in the paste buffer; by the way, when not using the ROM version you must use *SPEAK not OS.("SPEAK"), at least on a Model B; seems OSCLI doesn't go through quite the same vectors as star)
   bbc_charsSoFar = len(cmd)+len(pronunc)+1 # +1 for the space that'll be after this word if we don't start a new line
   bbc_partsSoFar = parts+1 # ditto
   return as_utf8(prefix+cmd)+pronunc
bbc_partsSoFar=bbc_charsSoFar=0
3112
def sylcount(example_format_festival):
  """Tries to count the number of syllables in a Festival string (see mainopt_syllables).  We treat @ as counting the same as the previous syllable (e.g. "fire", "power"), but this can vary in different songs, so the result will likely need a bit of proofreading."""
  phones = example_format_festival.split() # no brackets, emphasis by vowels, but spaces between each syllable
  total = 0
  pending = 0 # an extra syllable for a trailing "@", added only if a suitable consonant follows
  in_vowel = seen_at = False
  for idx,phone in enumerate(phones):
    first = phone[:1]
    if first in "aeiou": in_vowel = False # unconditionally start new syllable
    if first not in "aeiou@12":
      # consonant: settle any pending "@" and reset the vowel state
      if first not in "drz": total += pending # not 'r/z' e.g. "ours", "fired" usually 1 syllable in songs, "desirable" usually 4 not 5
      # TODO steward?  y u@ 1 d but usally 2 syllables
      in_vowel = seen_at = False ; pending = 0
      continue
    if not in_vowel: total += 1
    elif first=="@" and not seen_at: pending = 1 # (e.g. "loyal", but NOT '1', e.g. "world")
    if "@" in phone: seen_at = True # for words like "cheerful" ("i@ 1 @" counts as one)
    in_vowel = True
    if first=="@" and idx>=3 and phones[idx-2:idx]==["ai","1"] and phones[idx-3] in ["s","h"]:
      # special rule for higher, Messiah, etc - like "fire" but usually 2 syllables
      pending = 0 ; total += 1
  return total
def hyphenate(word,numSyls):
  "See mainopt_syllables.  Splits word into numSyls pieces of roughly equal length joined by '- ', nudging the split points around doubled consonants, hyphens and leading vowels.  Leading and trailing non-letters are kept outside the split.  Returns word unchanged if numSyls is less than 1 or exceeds the number of characters left after stripping.  A UTF-8 byte string is decoded for processing and re-encoded on return"
  orig = word
  if numSyls < 1: return orig # nothing to split into (e.g. no vowel phonemes were counted); also protects the division below
  try: word,isu8 = word.decode('utf-8'),True
  except (AttributeError,UnicodeError): isu8 = False # no decode method (Python 3 str) or not decodable: process it as-is
  pre=[] ; post=[]
  while word and not 'a'<=word[:1].lower()<='z':
    pre.append(word[:1]) ; word=word[1:]
  while word and not 'a'<=word[-1].lower()<='z':
    post.insert(0,word[-1:]) ; word=word[:-1]
  if numSyls>len(word): return orig # probably numbers or something
  l = (len(word)+numSyls//2)//numSyls ; syls = [] # rounded length of each piece, in integer arithmetic on both Python 2 and 3
  for i in range(numSyls):
    if i==numSyls-1: syls.append(word[i*l:]) # last piece takes whatever is left
    else: syls.append(word[i*l:(i+1)*l])
    if len(syls)>1:
      if syls[-1].startswith('-') or (len(syls[-1])>2 and syls[-1][:1]==syls[-1][1:2] and not syls[-1][:1].lower() in "aeiou"):
        # repeated consonant at start - put one on previous
        # (or hyphen at start - move it to the previous)
        syls[-2] += syls[-1][:1]
        syls[-1] = syls[-1][1:]
      elif len(syls[-1])>2 and syls[-1][1]=='-':
        # better move this splitpoint after that hyphen (TODO: move more than one character?)
        syls[-2] += syls[-1][:2]
        syls[-1] = syls[-1][2:]
      elif ((len(syls[-2])>2 and syls[-2][-1]==syls[-2][-2] and not syls[-2][-1].lower() in "aeiou") \
            or (syls[-1] and syls[-1][:1].lower() in "aeiouy" and len(syls[-2])>2)) \
            and list(filter(lambda x:x.lower() in "aeiou",list(syls[-2][:-1]))):
        # repeated consonant at end - put one on next
        # or vowel on right: move a letter over (sometimes the right thing to do...)
        # (unless doing so leaves no vowels)
        syls[-1] = syls[-2][-1]+syls[-1]
        syls[-2] = syls[-2][:-1]
  word = ''.join(pre)+"- ".join(syls)+''.join(post)
  if isu8: word=word.encode('utf-8')
  return word
3167
def macSayCommand():
  """Return the environment variable SAY_COMMAND if it is set and if it is non-empty, otherwise return "say".
  E.g. SAY_COMMAND="say -o file.aiff" (TODO: document this in the help text?)
  In Gradint you can set (e.g. if you have a ~/.festivalrc) extra_speech=[("en","python lexconvert.py --mac-uk festival")] ; extra_speech_tofile=[("en",'echo %s | SAY_COMMAND="say -o /tmp/said.aiff" python lexconvert.py --mac-uk festival && sox /tmp/said.aiff /tmp/said.wav',"/tmp/said.wav")]"""
  # an unset and an empty variable both fall through to the default
  return os.environ.get("SAY_COMMAND","") or "say"
3175
def stdin_is_terminal():
   "Returns True if it seems the standard input is connected to a terminal (rather than piped from a file etc).  If sys.stdin has no isatty method at all, a terminal is assumed"
   if hasattr(sys.stdin,"isatty"): return sys.stdin.isatty()
   return True
3179
def getInputText(i,prompt,as_iterable=False):
  """Gets text either from the command line or from standard input.  Issue prompt if there's nothing on the command line and standard input is connected to a tty instead of a pipe or file.  If as_iterable, return an iterable object over the lines instead of reading and returning all text at once.  If as_iterable=='maybe', return the iterable but if not reading from a tty then read everything into one item."""
  from_args = ' '.join(sys.argv[i:])
  if from_args:
    # command-line text wins; shape it as the caller asked
    if not as_iterable: return from_args
    if as_iterable=='maybe': return [from_args]
    return from_args.split('\n')
  if stdin_is_terminal(): sys.stderr.write("Enter "+prompt+" (EOF when done)\n")
  elif as_iterable=='maybe': return [getBuf(sys.stdin).read()]
  if not as_iterable:
     try: return getBuf(sys.stdin).read()
     except KeyboardInterrupt: raise SystemExit
  return my_xreadlines()
3193
try: raw_input # Python 2
except NameError: raw_input = input # Python 3
def my_xreadlines():
   "On some platforms this might be a bit more responsive than sys.stdin.xreadlines.  Yields one line at a time from raw_input (so without the line terminator) until end of input; an interrupt from the keyboard exits the program"
   while True:
      try: line = raw_input()
      except EOFError: break
      except KeyboardInterrupt: raise SystemExit
      yield line
3202
def output_clauses(format,clauses):
   "Writes out clauses and words in format 'format' (clauses is a list of lists of words in the phones of 'format').  By default, calls markup_inline_word and join as appropriate.  If however the format's 'clause_separator' has been set to a special case, calls that.  Refuses to write a format marked output_is_binary to a terminal"
   to_terminal = hasattr(sys.stdout,"isatty") and sys.stdout.isatty()
   if checkSetting(format,"output_is_binary") and to_terminal:
      print ("This is a binary format - not writing to terminal.\nPlease direct output to a file or pipe.")
      return
   clause_sep = checkSetting(format,"clause_separator","\n")
   if not type(clause_sep) in [bytes,unicode]:
      # special case: the setting is a function that does all the output itself
      clause_sep(clauses)
      return
   out = getBuf(sys.stdout)
   rendered = []
   for clause in clauses:
      word_sep = as_utf8(wordSeparator(format))
      rendered.append(word_sep.join([markup_inline_word(format,word) for word in clause]))
   out.write(as_utf8(clause_sep).join(rendered))
def write_bbcmicro_phones(clauses):
  """Special-case function set as clause_separator in bbcmicro format.  Must be a special case because it needs to track any extra keystrokes to avoid "Line too long".  And while we're at it, we might as well start a new *SPEAK command with each clause, using the natural brief delay between commands; this should minimise the occurrence of additional delays in arbitrary places.  Also calls print_bbc_warnings.
  Each clause is written to standard output as its own line; the keystroke total passed to print_bbc_warnings counts one line terminator per clause."""
  global bbc_charsSoFar
  totalKeystrokes = 0 ; lines = 0
  newline = as_utf8("\n")
  for clause in clauses:
    bbc_charsSoFar=0 # makes markup_bbcMicro_word begin a fresh *SPEAK (with no leading newline) for this clause
    l=as_utf8(" ").join([markup_inline_word("bbcmicro",word) for word in clause])
    # Terminate every clause: the next clause begins with *SPEAK and would otherwise be glued onto the end of this line.
    # (The +1 below has always counted this terminator.)
    getBuf(sys.stdout).write(l.replace(as_utf8(" \n"),newline)+newline)
    totalKeystrokes += len(l)+1 ; lines += 1
  print_bbc_warnings(totalKeystrokes,lines)
def print_bbc_warnings(keyCount,lineCount):
  "Print any relevant size warnings regarding sending 'keyCount' keys in 'lineCount' lines to the BBC Micro.  Nothing is printed when no limit is exceeded; otherwise a single message goes to standard error, starting 'WARNING: ' if the BeebEm keystroke limit is among the limits exceeded and 'Note: ' if not"
  sys.stdout.flush() # try to keep in sync if someone's doing 2>&1 | less
  exceeded = []
  severe = keyCount >= 32768
  # At least in version 3, the clipboard is defined in beebwin.h as a char of size 32768 and its bounds are not checked.  Additionally, if you script a second paste before the first has finished (or if you try to use BeebEm's Copy command) then the first paste will be interrupted.  So if you really want to make BeebEm read more then I suggest setting a printer destination file, putting a VDU 2,10,3 after each batch of commands, and waiting for that \n to appear in that printer file before sending the next batch, or perhaps write a set of programs to a disk image and have them CHAIN each other or whatever.
  if severe: exceeded.append("BeebEm 32K keystroke limit")
  shadow_himem=0x8000 # if using a 'shadow mode' on the Master/B+/Integra-B (modes 128-135, which leave all main RAM free)
  mode7_himem=0x7c00 # (40x25 characters = 1000 bytes, by default starting at 7c00 with 24 bytes spare at the top, but the scrolling system uses the full 1024 bytes and can tell the video controller to start rendering at any one of them; if you get Jeremy Ruston's book and program the VIDC yourself then you could fix it at 7c18 if you really want, or just set HIMEM=&8000 and don't touch the screen, but that doesn't give you very much more room)
  default_speech_loc=0x5500
  overhead_per_program_line = 4
  machines = [
        (0x1900,"Model B"), # with Acorn DFS (a reasonable assumption although alternate DFS ROMs are different)
        (0xE00,"Master")] # (the Master has 8k of special paged-in "filing system RAM", so doesn't need 2816 bytes of main RAM for DFS)
  for page,model in machines:
     is_master = model=="Master"
     top = page+keyCount+lineCount*(overhead_per_program_line-1)+2 # the -1 is because keyCount includes a carriage return at the end of each line
     if top > default_speech_loc:
        # The Speech program does nothing to stop your program (or its variables etc) from growing large enough to overwrite &5500, nor does it stop the stack pointer (coming down from HIMEM) from overwriting &72FF. For more safety on a Model B you could use RELOCAT to put Speech at &5E00 and be sure to set HIMEM=&5E00 before loading, but then you must avoid commands that change HIMEM, such as MODE (but selecting any non-shadow mode other than 7 will overwrite Speech anyway, although if you set the mode before loading Speech then it'll overwrite screen memory and still work as long as the affected part of the screen is undisturbed).  You can't do tricks like ditching the lexicon because RELOCAT won't let you go above 5E00 (unless you fix it, but I haven't looked in detail; if you can fix RELOCAT to go above 5E00 then you can create a lexicon-free Speech by taking the 1st 0x1560 bytes of SPEECH and append two * bytes, relocate to &6600 and set HIMEM, but don't expect *SAY to work, unless you put a really small lexicon into the spare 144 bytes that are left - RELOCAT needs an xx00 address so you can't have those bytes at the bottom).  You could even relocate to &6A00 and overwrite (non-shadow) screen memory if you don't mind the screen being filled with gibberish that you'd better not erase! (well if you program the VIDC as mentioned above and you didn't re-add a small lexicon then you could get yourself 3.6 lines of usable Mode 7 display from the spare bytes but it's probably not worth the effort)
        if is_master:
           # I don't know why but SP8000 can play higher and more distorted than SPEECH, at least on emulation (and changing the emulation speed doesn't help, because that setting, at least in BeebEm3, just controls extra usleep every frame; it doesn't actually slow down the 6502 *between* frames; anyway timing of sound changes is done by CyclesToSamples stuff in beebsound.cc's SoundTrigger).  If on the Master you go into View (*WORD) and then try SP8000, it plays _lower_ than *SPEECH (even if you do *BASIC first) and *SAY can corrupt a View document; ViewSheet (*SHEET) doesn't seem to have this effect; neither does *TERMINAL but *SAY can confuse the terminal.
           # Re bank numbers, by default banks 4 to 7 are Sideways RAM (4*16k=64k) and I suppose filling up from 7 makes sense because banks 8-F are ROMs (ANFS,DFS,ViewSheet,Edit,BASIC,ADFS,View,Terminal; OS is a separate 16k so there's scope for 144k of supplied ROM).  Banks 0-3 are ROM expansion slots.  The "128" in the name "Master 128" comes from 32k main RAM, 64k Sideways RAM, 20k shadow RAM (for screen modes 128-135), 4k OS "private RAM" (paged on top of 8000-8FFF) and 8k filing system RAM (paged on top of C000-DFFF) = 128k.  Not sure what happened on the B+.
           # By the way BeebEm's beebsound.cc also shows us why SOUND was always out of tune especially in the higher pitches.  The 16-bit freqval given to the chip is 125000/freq and must be an integer, so the likely temperament in cents for non-PCM is given by [int(math.log(125000.0/math.ceil(125000/freq)/freq,2**(1.0/1200))) for freq in [440*((2**(1.0/12))**semi) for semi in range(-12*3+2,12*2+6)]] (the actual temperament will depend on the OS's implementation of mapping SOUND pitch values to freqval's, unless you program the chip directly, but this list is indicative and varies over 10% in the top 2 octaves)
           # Some other ROMs (e.g. Alan Blundell's "Informant" 1989) seem to result in a crash after the *SPEECH and/or *SPEAK commands complete, at least in some emulator configurations; this may or may not be resolved via timing adjustments or adjustments in the ROM order; not sure exactly what the problem is
           hint=" (use Speech's Sideways RAM version instead, e.g. *SRLOAD SP8000 8000 7 and reset, but sound quality might be worse)"
        else: hint=" (Speech program will be overwritten unless relocated)" # (could use Sideways RAM for it instead if you have it fitted, see above)
        exceeded.append("%s TOP=&%X limit%s" % (model,default_speech_loc,hint))
     if top <= mode7_himem: continue
     if not is_master:
        # unless you overwrite the screen (see above) - let's assume the Model B hasn't been fitted with shadow modes (although the Integra-B add-on does give them to the Model B, and leaves PAGE at &1900; B+ has shadow modes but I don't know what's supposed to happen to PAGE on it).  65C02 Tube doesn't help much (it'll try to run Speech on the coprocessor instead of the host, and this results in silence because it can't send its sound back across the Tube; don't know if there's a way to make it run on the host in these circumstances or what the host's memory map is like)
        exceeded.append(model+" Mode 7 HIMEM limit")
     elif top > shadow_himem:
        # TODO: maybe add instructions for using BAS128 on the B+ or Master; this sets PAGE=&10000 and HIMEM=&20000 (i.e. 64k for programs), which uses all 4 SRAM slots so you can't use SP8000 (unless it's on a real ROM); if using Speech in main memory you need to RELOCAT it to leave &3000 upwards for Bas128 code; putting it at &1900 for B+/DFS leaves you only 417 bytes for lexicon (which might not matter if you're using only *SPEECH: just create a shortened lexicon); putting it at &E00 for Master allows space for the default 2204-byte lexicon with 1029 bytes to spare; TODO check if Bas128 uses any workspace between &E00 and &3000 though.  Alternatively (if you really want to store such a long program on the BBC) then you'd better split it into several programs that CHAIN each other (as mentioned above).
        exceeded.append(model+" 32k HIMEM limit (even for shadow modes)")
     else: exceeded.append(model+" Mode 7 HIMEM limit (use shadow modes 128-135)")
  if lineCount > 32768: exceeded.append("BBC BASIC line number limit") # and you wouldn't get this far without filling the memory, even with 128k (4 bytes per line)
  elif 10*lineCount > 32767: exceeded.append("AUTO line number limit (try AUTO 0,1)") # (default AUTO increments in steps of 10; you can use AUTO 0,1 to start at 0 and increment in steps of 1.  BBC BASIC stores its line info in a compact form which allows a range of 0-32767.)
  if not exceeded: return
  if severe: prefix,reassurance = "WARNING: ",""
  else: prefix,reassurance = "Note: ","It should still work if pasted into BeebEm as immediate commands. "
  tail = ". "+reassurance+"See comments in lexconvert for more details.\n"
  if len(exceeded)>1: body = "this text may be too big for the BBC Micro. The following limits were exceeded: "+", ".join(exceeded)
  else: body = "this text may be too big for the BBC Micro because it exceeds the "+exceeded[0]
  sys.stderr.write(prefix+body+tail)
def bbc_prepDefaultLex(outFile):
  """Special-case function set as lex_header in bbcmicro format.  If SPEECH_DISK and MAKE_SPEECH_ROM is set, then read the ROM code from SPEECH_DISK and write to outFile (meant to go before the lexicon, to make a modified BBC Micro Speech ROM with custom lexicon).
  Does nothing if MAKE_SPEECH_ROM is unset or empty.  Raises KeyError if MAKE_SPEECH_ROM is set without SPEECH_DISK, and ValueError (from index) if the expected markers are not found in the disk image."""
  if not os.environ.get("MAKE_SPEECH_ROM",0): return
  sd = open(os.environ['SPEECH_DISK']) # if this fails, SPEECH_DISK was not set or was set incorrectly (it's required for MAKE_SPEECH_ROM)
  try: d=getBuf(sd).read()
  finally: sd.close() # release the disk image promptly rather than waiting for garbage collection
  i=d.index(as_utf8('LO')+chr(0x80)+as_utf8('LP')+chr(0x80)+chr(0x82)+chr(0x11)) # start of SP8000 file (if this fails, it wasn't a Speech disk)
  j=d.index(as_utf8('>OUS_'),i) # start of lexicon (ditto)
  assert j-i==0x1683, "Is this really an original disk image?"
  getBuf(outFile).write(d[i:j])
def bbc_appendDefaultLex(outFile):
  """Special-case function set as lex_footer in bbcmicro format.  If SPEECH_DISK is set, read Speech's default lexicon from it and append this to outFile.  Otherwise just write a terminating >** to outFile.  In either case, check for exceeding 16k if we're MAKE_SPEECH_ROM, close the file and call print_bbclex_instructions.
  (The terminating >** is written in both cases: the default lexicon is copied up to but not including its own terminator.)"""
  if os.environ.get("SPEECH_DISK",""):
     sd = open(os.environ['SPEECH_DISK'])
     try: d=getBuf(sd).read()
     finally: sd.close() # release the disk image promptly rather than waiting for garbage collection
     i=d.index(as_utf8('>OUS_')) # if this fails, it wasn't a Speech disk
     j=d.index(as_utf8(">**"),i)
     assert j-i==2201, "Lexicon on SPEECH_DISK is wrong size (%d). Is this really an original disk image?" % (j-i)
     getBuf(outFile).write(d[i:j])
     # TODO: can we compress the BBC lexicon?  i.e. detect if a rule will happen anyway due to subsequent wildcard rules, and delete it if so (don't know how many bytes that would save)
  outFile.write(">**")
  fileLen = outFile.tell()
  assert not os.environ.get("MAKE_SPEECH_ROM",0) or fileLen <= 16384, "Speech ROM file got too big (%d)" % fileLen
  outFile.close()
  print_bbclex_instructions(getSetting("bbcmicro","lex_filename"),fileLen)
3276
def bbcshortest(n):
  """Convert integer n into the shortest possible number of BBC Micro keystrokes; prefer hex if and only if the extra '&' keystroke won't make it any longer than its decimal equivalent"""
  decimal,hexadecimal = str(n),'&%X'%n
  if len(hexadecimal) <= len(decimal): return as_utf8(hexadecimal)
  return as_utf8(decimal)
def bbcKeystrokes(data,start):
  "Return BBC BASIC keystrokes to put data into RAM starting at address start, without using the BASIC heap in the process (although we do use one of the page-4 integer variables to save some keystrokes).  Assumes the data is mostly ASCII so the $ operator is the least-keystrokes method of getting it in (rather than ? and ! operators, assembler EQUB/EQUW/EQUS, 6502 mnemonics, etc); we don't mind about overwriting the byte after with a CHR$(13).  Keystrokes are limited to ASCII for easier copy/paste.  See comments for more details."
  # The result is a byte string of newline-terminated lines, each at most bbc_max_line_len characters long and each made of one "$address=" assignment
  # (the first line may also carry the A%/B% set-up described below).
  # Taken to the extreme, a 'find the least keystrokes' function would be some kind of data compressor; we're not doing that here as we assume this is going to be used to poke in a lexicon, which is basically ASCII with a few CHR$(128)s thrown in; this '$ operator' method is highly likely to yield the least keystrokes for that kind of data, apart from setting and using temporary string variables, but then (1) you're in the realms of data compression and (2) you require heap memory, which might not be a good idea depending on where we're putting our lexicon.
  # I suppose it wouldn't hurt in most cases to have an A$=CHR$(128), but not doing this for now because you might be in a situation where you can't touch the heap at all (I'm not sure where the workspace for assembling strings is though).
  # However, just to be pedantic about saving a few bytes, there is one thing we CAN do: if we have a lexicon with a lot of CHR$(128)s in it, let's set up BASIC's page-4 integer variables such that $A%=CHR$(128), saving 6 keystrokes per entry without needing the heap (an additional 1 keystroke per entry could be saved if we didn't mind putting an A$ on the heap).
  # The hack is used only if the data lies wholly at or above address 1030 or ends at or below 1026, and contains at least three bytes of value 128
  # (presumably the address test keeps the data clear of the page-4 variables being set up - TODO confirm the exact bounds)
  use_int_hack = ((start>=1030 or start+len(data)<=1026) and len(data.split(chr(128))) >= 4)
  i=0 ; ret=[]
  if use_int_hack: thisLine = as_utf8("A%=&408:B%=&D80:") # (@% is at &400 and each is 4 byte LSB-MSB; $x reads to next 0D)
  # (If we're guaranteed to NOT be using Bas128 and therefore all memory addresses are effectively masked by &FFFF, we can instead set A%=&D800406 (using A%'s low 2 bytes to point to A%'s high 2 bytes) for a 1-off saving of 5 keystrokes and 1 page-4 variable, but this saving is not really worth the readability compromise and the risk posed by the possibility of Bas128 - I don't know how Bas128 treats addresses above &1FFFF)
  # (An even 'nastier' trick would be to put !13=&D80 and then use $13, as those bytes are used by BASIC's random number generator, which presumably isn't called during the paste and we don't mind disrupting it; again I don't know about Bas128.  But you can't do it because BASIC gives a "$ range" error on anything below 256.)
  # (I suppose one thing you _could_ do is LOMEM=&400:A$=CHR$(13) and end with LOMEM=TOP, which would overwrite 3 page-4 variables and let you use just A$ instead of $A%, saving keystrokes over A%=&D800406 after 21 more lexicon words, at the expense of losing track of any variables you had on the heap.  But this is getting silly.)
  else: thisLine = as_utf8("")
  bbc_max_line_len = 238
  # State of the line being built: inQuote = a "..." literal is currently open on thisLine; needPlus = the next item must be preceded by '+';
  # needCmd = a new "$address=" assignment has to be started before anything else is added
  inQuote=needPlus=0 ; needCmd=1
  while i<len(data):
    if needCmd:
       thisLine += (as_utf8('$')+bbcshortest(start)+as_utf8('='))
       inQuote=needPlus=needCmd=0
    # c is the text for the byte at i; inQ says whether that text belongs inside a quoted literal
    if data[i:i+1]==as_utf8('"'): c,inQ = as_utf8('""'),1 # inQ MUST be 0 or 1, not False/True, because it's also used as 'len of necessary close quote' below
    elif 32<=ord(data[i:i+1])<127: c,inQ = data[i:i+1],1
    elif use_int_hack and ord(data[i:i+1])==128: c,inQ=as_utf8("$A%"),0
    else: c,inQ=(as_utf8("CHR$("+str(ord(data[i:i+1]))+")")),0
    # work out the quote and '+' glue needed to move from the current quoting state to this byte's state
    addToLine = [] ; newNeedPlus = needPlus
    if inQ and not inQuote:
       if needPlus: addToLine.append(as_utf8('+'))
       addToLine.append(as_utf8('"'))
       newNeedPlus=0
    elif inQuote and not inQ:
       addToLine.append(as_utf8('"+'))
       newNeedPlus=1 # after what we'll add
    elif not inQ:
       if needPlus: addToLine.append(as_utf8('+'))
       newNeedPlus=1 # after what we'll add
    addToLine.append(c)
    addToLine=as_utf8('').join(addToLine)
    if len(thisLine)+len(addToLine)+inQ > bbc_max_line_len: # oops, we've gone too far, back off and end prev line
       if inQuote: thisLine += as_utf8('"')
       ret.append(thisLine)
       # i and start are not advanced here, so the same byte is retried on a fresh line that begins with a new "$address="
       thisLine=as_utf8("") ; needCmd=1 ; continue
    thisLine += addToLine ; inQuote=inQ
    needPlus=newNeedPlus ; i += 1 ; start += 1
  if inQuote: thisLine += as_utf8('"')
  # needCmd still set here means nothing was added since the last line was flushed (or data was empty)
  if not needCmd: ret.append(thisLine)
  return as_utf8('\n').join(ret)+as_utf8('\n')
def print_bbclex_instructions(fname,size):
 """Print suitable instructions for a BBC Micro lexicon of the given filename and size (the exact nature of the instructions depends on the size).  If appropriate, create a .key file containing keystrokes for transferring to an emulator.
 If MAKE_SPEECH_ROM is set, only the ROM installation note is printed; otherwise the main-RAM and Sideways RAM instructions are printed, and fname+".key" is written whenever the lexicon can be fitted into main RAM."""
 if os.environ.get("MAKE_SPEECH_ROM",0): print ("%s (%d bytes, hex %X) can now be installed on an emulator (set in Roms.cfg or whatever), or loaded onto a chip.  The sound quality of this might be worse than that of the main-RAM version." % (fname,size,size)) # (at least on emulation - see comment on sound quality above)
 else:
  print ("The size of this lexicon is %d bytes (hex %X)" % (size,size)) # (the default lexicon is 2204 bytes)
  bbcStart=None # main-RAM address for the lexicon; stays None if it cannot be fitted
  noSRAM_lex_offset=0x155F # (on the BBC Micro, SRAM means Sideways RAM, not Static RAM as it does elsewhere; for clarity we'd better say "Sideways RAM" in all output)
  SRAM_lex_offset=0x1683
  SRAM_max=0x4000 # 16k
  noSRAM_default_addr=0x5500
  noSRAM_min_addr=0xE00 # minimum supported by RELOCAT
  page=0x1900 # or 0xE00 for Master (but OK to just leave this at 0x1900 regardless of model; it harmlessly increases the range where special_relocate_instructions 'kick in')
  noSRAM_himem=0x7c00 # unless you're in a shadow mode or something (see comments on himem above), however leaving this at 0x7c00 is usually harmless (just causes the 'need to relocate' to 'kick in' earlier, although if memory is really full it might say 'too big' 1k too early)
  def special_relocate_instructions(reloc_addr):
    # Returns extra instruction text (possibly empty) for relocating Speech to reloc_addr, or False if RELOCAT cannot manage that address even with PAGE changes
    pagemove_min,pagemove_max = max(0xE00,page-0x1E00), page+0xE00 # if relocating to within this range, must move PAGE before loading RELOCAT. RELOCAT's supported range is 0xE00 to 0x5E00, omitting (PAGE-&1E00) to (PAGE+&E00)
    if reloc_addr < 0x1900: extra=" On a Model B with Acorn DFS you won't be able to use the disk after relocating below &1900, and you can't run star commands from tape so you have to initialise via CALL. (On a Master, DFS is not affected as it doesn't use &E00-&1900.)"
    else: extra = ""
    if not pagemove_min<=reloc_addr<pagemove_max:
      return extra # no other special instructions needed
    newpage = reloc_addr+0x1E00
    page_max = min(0x5E00,noSRAM_default_addr-0xE00)
    if newpage > page_max: return False # "Unfortunately RELOCAT can't put it at &%X even with PAGE changes." % reloc_addr
    return " Please run RELOCAT with PAGE in the range of &%X to &%X for this relocation to work.%s" % (newpage,page_max,extra)
  if noSRAM_default_addr+noSRAM_lex_offset+size > noSRAM_himem:
    # does not fit below HIMEM at the default address: find the highest page-aligned address at which it would
    reloc_addr = noSRAM_himem-noSRAM_lex_offset-size
    reloc_addr -= (reloc_addr%256)
    if reloc_addr >= noSRAM_min_addr:
      instr = special_relocate_instructions(reloc_addr)
      if instr==False: print ("This lexicon is too big for Speech in main RAM even with relocation, unless RELOCAT is rewritten to work from files.")
      else:
        bbcStart = reloc_addr+noSRAM_lex_offset
        reloc_call = reloc_addr + 0xB00
        print ("This lexicon is too big for Speech at its default address of &%X, but you could use RELOCAT to put a version at &%X and then initialise it with CALL %s (or do the suggested *SAVE, reset, and run *SP). Be sure to set HIMEM=&%X. Then *LOAD %s %X or change the relocated SP file from offset &%X.%s" % (noSRAM_default_addr,reloc_addr,bbcshortest(reloc_call),reloc_addr,fname,bbcStart,noSRAM_lex_offset,instr))
    else: print ("This lexicon is too big for Speech in main RAM even with relocation.")
  else: # fits at default location - no relocation needed
    bbcStart = noSRAM_default_addr+noSRAM_lex_offset
    print ("You can load this lexicon by *LOAD %s %X or change the SPEECH file from offset &%X. Suggest you also set HIMEM=&%X for safety." % (fname,bbcStart,noSRAM_lex_offset,noSRAM_default_addr))
  if bbcStart: # we managed to fit it into main RAM
     f = open(fname)
     try: keys = bbcKeystrokes(getBuf(f).read(),bbcStart)
     finally: f.close()
     f = open(fname+".key","w")
     try: getBuf(f).write(keys)
     finally: f.close() # make sure the keystrokes are on disk before we announce the file
     print ("For ease of transfer to emulators etc, a self-contained keystroke file for putting %s data at &%X has been written to %s.key" % (fname,bbcStart,fname))
     if len(keys) > 32767: print ("(This file looks too big for BeebEm to paste though)") # see comments elsewhere
  # Instructions for replacing lex in SRAM:
  if size > SRAM_max-SRAM_lex_offset: print ("This lexicon is too big for Speech in Sideways RAM.") # unless you can patch Speech to run in SRAM but read its lexicon from main RAM, or run in main RAM but page in multiple banks of SRAM for the lexicon (but even then there'll be a limit)
  else: print ("You can load this lexicon into Sideways RAM by *SRLOAD %s %X 7 (or whichever bank number you're using), or change the SP8000 file from offset &%X." % (fname,SRAM_lex_offset+0x8000,SRAM_lex_offset))
  if not os.environ.get("SPEECH_DISK",""): print ("If you want to append the default lexicon to this one, set SPEECH_DISK to the image of the original Speech disk before running lexconvert, e.g. export SPEECH_DISK=/usr/local/BeebEm3/diskimg/Speech.ssd")
  if size <= SRAM_max-SRAM_lex_offset: print ("You can also set MAKE_SPEECH_ROM=1 (along with SPEECH_DISK) to create a SPEECH.ROM file instead")
 print ("If you get 'Mistake in speech' when testing some words, try starting with '*SAY, ' (this seems to be a Speech bug)") # - can't track down which words it does and doesn't apply to
 print ("It might be better to load your lexicon into eSpeak and use lexconvert's --phones option to drive the BBC with phonemes.")
3377
def mainopt_version(i):
   # TODO: doc string for the help? (or would this option clutter it needlessly) - just print lexconvert's version number and nothing else
   # (For now there is deliberately no docstring: main() skips any mainopt_ function without one when it builds the help.)
   # i (position of the option in sys.argv) is not used.
   # Print only the part of the module docstring's first line that precedes " - ".
   firstLine = __doc__.split("\n")[0]
   print (firstLine.split(" - ")[0])
3381
def main():
    """Command-line entry point.

    Looks through the module globals for functions named
    mainopt_*; the option for mainopt_foo_bar is --foo-bar.
    If one of those options is in sys.argv, the function is
    called with the option's index in sys.argv, and whatever
    it returns (or the .message of a Message exception it
    raises) is treated as an error message: a non-empty one
    is written to standard error and 1 is returned, otherwise
    0 is returned.

    If no such option is present, prints the help (module
    docstring, list of formats, and every documented
    mainopt_ option) and returns 0.  --formats gives format
    descriptions in plain text; --htmlhelp gives HTML.

    Returns the error code to send back to the OS."""
    def funcToOpt(n):
        # "mainopt_foo_bar" -> "--foo-bar"
        return "--" + n[n.index("_")+1:].replace("_","-")

    # 1. Dispatch to the first matching mainopt_ function, if any
    for name,func in globals().items():
        if not name.startswith('mainopt_'): continue
        optName = funcToOpt(name)
        if optName not in sys.argv: continue
        try: msg = func(sys.argv.index(optName))
        except Message:
            # Python 2.6+ can have "except Message as e",
            # but Python 2.5 has to have "except Message,e"
            # which is disallowed in Python 3, so
            msg = sys.exc_info()[1].message
        if not msg: return 0
        sys.stdout.flush() # so any normal output appears before the error
        sys.stderr.write(msg+"\n")
        return 1

    # 2. No option function was requested: print the help
    html = ('--htmlhelp' in sys.argv) # (undocumented option used for my website, don't rely on it staying)
    if html:
        def htmlify(h):
            escaped = h.replace('&','&amp;').replace('<','&lt;').replace('>','&gt;').replace('\n','<br>')
            return re.sub('(--[2A-Za-z-]*)',r'<kbd>\1</kbd>',escaped)
        missALine = "<p>"
    else:
        def htmlify(h): return h
        missALine = ""
    print (htmlify(__doc__))
    print (missALine)

    if '--formats' in sys.argv: # non-HTML mode only (format descriptions are included in HTML anyway, and don't worry about the capability summary)
        print ("Available pronunciation formats (and support levels):")
        for fmt in sorted(lexFormats.keys()):
            if fmt=="example": types = []
            else: types = ["phones"]
            if fmt=="mac-uk": types.append("speaking")
            else:
                if checkSetting(fmt,"lex_read_function"): types.append("lex-read")
                if checkSetting(fmt,"lex_filename") and checkSetting(fmt,"lex_entry_format"):
                    ltype = checkSetting(fmt,"lex_type")
                    if ltype: ltype=" as "+ltype
                    types.append("lex-write"+ltype)
            print ("\n"+fmt+" ("+", ".join(types)+")")
            print (getSetting(fmt,"doc"))
        return 0
    elif html:
        print ("Available pronunciation formats:")
        print ('<table id="formats">')
        for fmt in sorted(lexFormats.keys()):
            print ('<tr><td valign="top"><nobr>'+fmt+'</nobr></td><td valign="top">'+htmlify(getSetting(fmt,"doc"))+"</td></tr>")
        print ("</table><script><!-- try to be more readable on some smartphones\nif(((screen && screen.width<600) || navigator.userAgent.slice(-6)==\"Gecko/\" /* UC Browser? */) && document.getElementById && document.getElementById('formats').outerHTML) document.getElementById('formats').outerHTML = document.getElementById('formats').outerHTML.replace(/<table/g,'<dl').replace(/<.table/g,'<'+'/dl').replace(/<tr><td/g,'<dt').replace(/<.td><td/g,'<'+'/dt><dd').replace(/<.td><.tr/g,'<'+'/dd');\n//--></script>")
    else:
        print ("Available pronunciation formats: "+", ".join(sorted(list(lexFormats.keys())))+"\n(Use --formats to see their descriptions)")

    print (missALine)
    print ("Program options:")
    print (missALine)
    if html: print ("<dl>")
    # Sort key: options whose docstring starts with '*' sort first
    # (False < True), then alphabetically by function name.
    entries = []
    for name,obj in globals().items():
        doc = obj.__doc__
        entries.append((bool(doc) and not doc.startswith('*'),name,doc))
    entries.sort()
    for _,name,desc in entries:
        if not name.startswith("mainopt_"): continue
        if not desc: continue # undocumented option
        opt = funcToOpt(name)
        # First docstring line = the option's parameters, remainder = its description
        params,rest = desc.split("\n",1)
        if params.startswith('*'): params=params[1:]
        if params: opt += (' '+params)
        if html: print ("<dt>"+htmlify(opt)+"</dt><dd>"+htmlify(rest)+"</dd>")
        else: print (opt+"\n"+rest+"\n")
    if html: print ("</dl>")
    return 0
3442
# catchingSigs: True once catchSignals() has installed its handlers.
# inSigHandler: True once the handler has started running, so that the
# signal it re-sends to our own process group is not handled again.
catchingSigs = inSigHandler = False
def catchSignals():
  """We had better try to catch all signals if using MacBritish_System_Lexicon so we can safely clean it up. We raise KeyboardInterrupt instead (need to catch this). Might not work with multithreaded code.

  Only the first call does anything.  It installs one handler
  for every signal number from 1 to signal.NSIG-1, except
  SIGCHLD, SIGTSTP, SIGCONT, SIGWINCH and any signal that is
  currently being ignored.  Signals whose handler cannot be
  changed are silently left alone.

  The handler passes the signal on to the whole process group,
  writes a "Caught signal" line to standard error and raises
  KeyboardInterrupt; it acts only once per process."""
  global catchingSigs
  if catchingSigs: return
  catchingSigs = True
  import signal
  def f(sigNo,*args):
    global inSigHandler
    if inSigHandler: return # already dealing with one (e.g. the one we re-sent to ourselves below)
    inSigHandler = True
    os.killpg(os.getpgrp(),sigNo) # forward it to the rest of our process group
    sys.stderr.write("\nCaught signal %d\n" % sigNo)
    raise KeyboardInterrupt
  leaveAlone = [
          signal.SIGCHLD, # sent on subprocess completion
          signal.SIGTSTP,signal.SIGCONT, # Ctrl-Z / fg
          signal.SIGWINCH, # window-size change
  ]
  for n in range(1,signal.NSIG): # (range, not xrange: works on both Python 2 and Python 3, and NSIG is small)
    if n in leaveAlone: continue
    if signal.getsignal(n)==signal.SIG_IGN: continue
    try: signal.signal(n,f)
    except (ValueError,OSError,RuntimeError): pass # this signal cannot be caught (e.g. SIGKILL), or we are not in the main thread
class MacBritish_System_Lexicon(object):
    """Overwrites some of the pronunciations in the system
    lexicon (after backing up the original).  Cannot
    change the actual words in the system lexicon, so just
    alters pronunciations of words you don't intend to use
    so you can substitute these into your texts.
    Restores the lexicon on close().

    The class attribute 'instances' maps each voice name to
    its one read-write instance; a read-write instance also
    holds /tmp/<voice>.PCMWave.lock (taken with the
    'lockfile' command) until close() is called.

    Note: this class uses Python 2 facilities (cPickle,
    unichr, dict.iteritems, str.decode, iterator .next())."""
    instances = {}
    def __init__(self,text="",voice="Daniel"):
        """text is the text you want to speak (so that any
        words used in it that are not mentioned in your
        lexicon are unchanged in the system lexicon);
        text="" means you just want to speak phonemes.
        Special value of text=False means lexicon read only.
        voice can be Daniel, Emily or Serena.

        Unless text is False, this takes the per-voice lock,
        makes a backup of the voice's PCMWave file (as
        PCMWave0, via sudo) if there isn't one already, and
        opens the voice file for in-place writing.  The byte
        offsets of the word and phoneme indexes are found by
        searching for known first/last entries, and are
        cached in a PCMWave.lexdir pickle next to the voice.
        Assertions fail if the voice is missing, is only the
        Compact version, or its indexes don't line up."""
        self.voice = False
        if not text==False:
            assert not voice in MacBritish_System_Lexicon.instances, "There is already another instance of MacBritish_System_Lexicon for the "+voice+" voice"
            assert not os.system("lockfile -1 -r 10 /tmp/"+voice+".PCMWave.lock") # in case some other process has it (note: if you run with python -O, this check won't happen!)
            self.voice = voice # (don't set this if text==False, since we won't need cleanup on __del__)
        self.filename = "/System/Library/Speech/Voices/"+voice+".SpeechVoice/Contents/Resources/PCMWave"
        assert not (not os.path.exists(self.filename) and os.path.exists("/System/Library/Speech/Voices/"+voice+"Compact.SpeechVoice/Contents/Resources/PCMWave")), "The only installation of "+voice+" found on this system was the Compact one, which lexconvert does not yet support" # TODO: could try self.wordIndexStart = findW("Abiquiu"),self.phIndexStart = findW("'@b.Ik.ju"),self.wordIndexEnd = findW("www.youtube.com",1),self.phIndexEnd = findW("'d^b.l.ju.'d^b.l.ju.'d^b.l.ju.dA+t.'ju.'tjub.dA+t.kA+m",1), but "t" in phones should be ignored, "activesync" and "afterlife" have no phones, "aqua" has TWO sets of phonemes (aquarium ok) and there are other synchronization issues.
        # TODO: some sync issues persist even on the NON-Compact version in newer versions of macOS (e.g. 10.12).  This currently leads to exceptions in findW on such systems (which do say it could be due to wrong version of the voice); fixing would need looking at more sync issues as above
        assert os.path.exists(self.filename),"Cannot find an installation of '"+voice+"' on this system"
        if os.path.exists(self.filename+"0"):
            if text==False: self.filename += "0" # (use the backup file for read-only, if we created one before; this means we don't have to worry about locks)
        elif not text==False: # create a backup
            sys.stderr.write("Backing up "+self.filename+" to "+self.filename+"0...\n") # (you'll need a password if you're not running as root)
            err = os.system("sudo mv \""+self.filename+"\" \""+self.filename+"0\"; sudo cp \""+self.filename+"0\" \""+self.filename+"\"; sudo chown "+str(os.getuid())+" \""+self.filename+"\"")
            assert not err, "Error creating backup"
        lexFile = self.filename+".lexdir"
        if not os.path.exists(lexFile) and not text==False:
            sys.stderr.write("Creating lexdir file...\n")
            err = os.system("sudo touch \""+lexFile+"\" ; sudo chown "+str(os.getuid())+" \""+lexFile+"\"")
            assert not err, "Error creating lexdir"
        compat_err = "\nThis probably means your Mac has a new version of the voice that is no longer compatible with this system-lexicon patch."
        import cPickle
        if os.path.exists(lexFile) and os.stat(lexFile).st_size:
            # Use the cached offsets.  NOTE: the .lexdir file is unpickled as-is, i.e. its contents are trusted.
            lexIn = open(lexFile)
            try: self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd = cPickle.Unpickler(lexIn).load()
            finally: lexIn.close()
        else:
            # No cache yet: locate the indexes by searching the voice file for known entries
            f = open(self.filename)
            try: dat = getBuf(f).read()
            finally: f.close()
            def findW(word,rtnPastEnd=0):
                # Byte offset of the single occurrence of word+NUL in dat (offset just past it if rtnPastEnd); Exception if it occurs zero times or more than once
                i = re.finditer(re.escape(word+chr(0)),dat)
                try: n = i.next()
                except StopIteration: raise Exception(word+" not found in voice file"+compat_err)
                try:
                    n2 = i.next()
                    raise Exception("%s does not uniquely identify a byte position (has at least %d and %d)%s" % (word,n.start(),n2.start(),compat_err))
                except StopIteration: pass
                if rtnPastEnd: return n.end()
                else: return n.start()
            self.wordIndexStart = findW("808s")
            self.phIndexStart = findW("'e&It.o&U.e&Its")
            self.wordIndexEnd = findW("zombie",1)
            self.phIndexEnd = findW("'zA+m.bI",1)
            if not text==False:
                lexOut = open(lexFile,"w")
                try: cPickle.Pickler(lexOut).dump((self.wordIndexStart,self.wordIndexEnd,self.phIndexStart,self.phIndexEnd))
                finally: lexOut.close() # make sure the cache really is written out
        if text==False: self.dFile = open(self.filename)
        else: self.dFile = open(self.filename,'r+')
        assert len(self.allWords()) == len(self.allPh()), str(len(self.allWords()))+" words but "+str(len(self.allPh()))+" phonemes"+compat_err
        self.textToAvoid = u""
        if text==False: return
        MacBritish_System_Lexicon.instances[voice] = self
        self.textToAvoid = text.decode('utf-8').replace(unichr(160),' ') ; self.restoreDic = {} # restoreDic: file position -> original phonemes, for close()
        catchSignals()
    def allWords(self):
        "Returns a list of words that are defined in the system lexicon (which won't be changed, but see allPh)"
        self.dFile.seek(self.wordIndexStart)
        return [x for x in getBuf(self.dFile).read(self.wordIndexEnd-self.wordIndexStart).split(chr(0)) if x]
    def allPh(self):
        "Returns a list of (file position, phoneme string) for each of the primary phoneme entries from the system lexicon.  These entries can be changed in-place by writing to the said file position, and then spoken by giving the voice the corresponding word from allWords (but see also usable_words)."
        self.dFile.seek(self.phIndexStart)
        def f(l):
            # l = the NUL-separated items of the phoneme index, in file order; pos tracks each item's file position
            last = None ; r = [] ; pos = self.phIndexStart
            for i in l:
                if re.search(r'[ -~]',i) and not i in ["'a&I.'fo&Un","'lI.@n","'so&Un.j$"] and not (i==last and i in ["'tR+e&I.si"]): r.append((pos,i)) # (the listed pronunciations are secondary ones that for some reason are in the list)
                if re.search(r'[ -~]',i): last = i
                pos += (len(i)+1) # +1 for the \x00
            assert pos==self.phIndexEnd+1 # +1 because the last \00 will result in a "" item after; the above +1 will be incorrect for that item
            return r
        return f([x for x in getBuf(self.dFile).read(self.phIndexEnd-self.phIndexStart).split(chr(0))])
    def usable_words(self,words_ok_to_redefine=()):
        "Returns a list of (word,phoneme_file_position,original_phonemes) by combining allWords with allPh, but omitting any words that don't seem 'usable' (for example words that contain spaces, since these lexicon entries don't seem to be actually used by the voice).  Words that occur in self.textToAvoid are also considered non-usable, unless they also occur in words_ok_to_redefine (user lexicon)."
        # (This is a generator.  The default for words_ok_to_redefine is an immutable empty tuple; it is only ever tested with 'in'.)
        for word,(pos,phonemes) in zip(self.allWords(),self.allPh()):
            if not re.match("^[a-z0-9]*$",word): continue # it seems words not matching this regexp are NOT used by the engine
            if not (phonemes and 32<ord(phonemes[:1])<127): continue # better not touch those, just in case
            if word in self.textToAvoid and not word in words_ok_to_redefine: continue
            yield word,pos,phonemes
    def check_redef(self,wordsAndPhonemes):
        "Diagnostic function to list on standard error the 'redefinitions' we want to make.  wordsAndPhonemes is a list of (original system-lexicon word, proposed new phonemes).  The old phonemes are also listed, fetched from allPh."
        aw = self.allWords() ; ap = 0 # ap is fetched (and the heading printed) only when the first clash is found
        for w,p in wordsAndPhonemes:
            w = w.lower()
            if not re.match("^[a-z0-9]*$",w): continue
            if not w in aw: continue
            if not ap:
                ap = self.allPh()
                sys.stderr.write("Warning: some words were already in system lexicon\nword\told\tnew\n")
            sys.stderr.write(w+"\t"+ap[aw.index(w)][1]+"\t"+p+"\n")
    def speakPhones(self,phonesList):
        "Speaks every phonetic word in phonesList"
        # Each item gets a placeholder 'word' ("0s", "1s", ...); setMultiple then says which real lexicon word now carries its phonemes, and those words are piped to the 'say' command
        words = [str(x)+"s" for x in range(len(phonesList))]
        d = self.setMultiple(words,phonesList)
        msc = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
        getBuf(msc).write(as_utf8(" ").join(d.get(w,as_utf8("")) for w in words))
    def readWithLex(self,lex):
        "Reads the text given in the constructor after setting up the lexicon with the given (word,phoneme) list"
        # self.check_redef(lex) # uncomment if you want to know about these
        textToPrint = u' '+self.textToAvoid+u' '
        tta = ' '+self.textToAvoid.replace(u'\u2019',"'").replace(u'\u2032','').replace(u'\u00b4','').replace(u'\u02b9','').replace(u'\u00b7','').replace(u'\u2014',' ')+' ' # (ignore pronunciation marks 2032 and b7 that might be in the text, but still print them in textToPrint; also normalise apostrophes but not in textToPrint, and be careful with dashes as lex'ing the word after a hyphen or em-dash won't work BUT we still want to support hyphenated words IN the lexicon, so em-dashes are replaced here and hyphens are included in nonWordBefore below)
        words2,phonemes2 = [],[] # keep only the ones actually used in the text (no point setting whole lexicon)
        nonWordBefore=r"(?i)(?<=[^A-Za-z"+chr(0)+"-])" # see below for why chr(0) is included, and see comment above for why hyphen is at the end; (?i) = ignore case
        nonWordAfter=r"(?=([^A-Za-z'"+unichr(0x2019)+"-]|['"+unichr(0x2019)+r"-][^A-Za-z]))" # followed by non-letter non-apostrophe, or followed by apostrophe non-letter (so not if followed by "'s", because the voice won't use our custom lex entry if "'s" is added to the lex'd word, TODO: automatically add "'s" versions to the lexicon via +s or +iz?) (also not if followed by hyphen-letters; hyphen before start is handled above, although TODO preceded by non-letter + hyphen might be OK)
        ttal = tta.lower()
        for ww,pp in lex:
            ww = ww.decode('utf-8') # so you can add words with accents etc (in utf-8) to the lexicon
            if ww.lower() in ttal and re.search(nonWordBefore+re.escape(ww)+nonWordAfter,tta):
                words2.append(ww) ; phonemes2.append(pp)
        for k,v in self.setMultiple(words2,phonemes2).iteritems():
            # chr(0) is put before each substituted word (and chr(0)...chr(1) around each word to underline in the printed copy); nonWordBefore excludes chr(0), so a word already substituted is not matched again
            tta = re.sub(nonWordBefore+re.escape(k)+nonWordAfter,chr(0)+v,tta)
            textToPrint = re.sub(nonWordBefore+'('+u'[\u2032\u00b4\u02b9\u00b7]*'.join(re.escape(c) for c in k)+')'+nonWordAfter,chr(0)+r'\1'+chr(1),textToPrint)
        tta = tta.replace(chr(0),'')
        term = os.environ.get("TERM","")
        if ("xterm" in term or term=="screen") and sys.stdout.isatty(): # we can probably underline words (inverse is more widely supported than underline, e.g. should work even on an old Linux console in case someone's using that to control an OS X server, but there might be a *lot* of words, which wouldn't be very good in inverse if user needs dark background and inverse is bright.  Unlike Annogen, we're dealing primarily with Latin letters.)
            import textwrap
            textwrap.len = lambda x: len(x.replace(chr(0),"").replace(chr(1),"")) # a 'hack' to make (at least the 2.x implementations of) textwrap ignore our chr(0) and chr(1) markers in their calculations.  Relies on textwrap calling len().
            print (textwrap.fill(textToPrint,stdout_width_unix(),break_on_hyphens=False).encode('utf-8').replace(chr(0),"\x1b[4m").replace(chr(1),"\x1b[0m").strip()) # break_on_hyphens=False because we don't really want hyphenated NAMES to be split across lines, and anyway textwrap in (at least) Python 2.7 has a bug that sometimes causes a line breaks to be inserted before a syllable marker symbol like 'prime'
        # else don't print anything (saves confusion)
        msc = os.popen(macSayCommand()+" -v \""+self.voice+"\"",'w')
        getBuf(msc).write(tta.encode('utf-8'))
    def setMultiple(self,words,phonemes):
        """Sets phonemes for words, returning dict of word to substitute word.  Flushes file buffer before return.

        words and phonemes are parallel lists.  Each new
        phoneme string is written over the phonemes of some
        usable system-lexicon word whose existing phonemes
        are at least as long; the returned dict maps each
        word to the (capitalised) system-lexicon word that
        now has its pronunciation.  If the slots run out, a
        message is written to standard error and the dict
        returned covers only the words done so far."""
        avail = [] ; needed = []
        for word,pos,phon in self.usable_words(words):
            avail.append((len(phon),word,pos,phon))
        for word,phon in zip(words,phonemes):
            needed.append((len(phon),word,phon))
        avail.sort() ; needed.sort() # shortest phon first
        i = 0 ; wDic = {} ; iDone=set() ; mustBeAlpha=True
        # mustBeAlpha: prefer alphabetical words, since
        # these can be capitalised at start of sentence
        # (the prosody doesn't always work if it isn't)
        for l,word,phon in needed:
            # Skip slots that are too short, non-alphabetical (on the first pass), or already used.  The bounds test comes first so that an empty or exhausted avail is reported below rather than raising IndexError.
            while i>=len(avail) or avail[i][0] < l or (mustBeAlpha and not re.match(as_utf8("[A-Za-z]"),avail[i][1])) or i in iDone:
                i += 1
                if i>=len(avail):
                    if mustBeAlpha: # desperate situation: we HAVE to use the non-alphabetical slots now (ideally we should pick words that never occur at start of sentence for them, but this branch is hopefully a rare situation in practice)
                        mustBeAlpha=False ; i=0; continue
                    sys.stderr.write("Could not find enough lexicon slots!\n") # TODO: we passed 'words' to usable_words's words_ok_to_redefine - this might not be the case if we didn't find enough slots
                    self.dFile.flush() ; return wDic
            iDone.add(i)
            _,wSubst,pos,oldPhon = avail[i] ; i += 1
            if pos in self.restoreDic: oldPhon=None # (the slot we just chose already has its original saved.)  shouldn't happen if setMultiple is called only once, but might be useful for small experiments in the Python interpreter etc
            self.set(pos,phon,oldPhon)
            wDic[word] = wSubst[:1].upper()+wSubst[1:] # always capitalise it so it can be used at start of sentence too (TODO: copy original capitalisation of each instance instead, in case it happens to come directly after a dotted abbreviation? although if it's something that's always capitalised anyway, e.g. most names, then this won't make any difference)
        self.dFile.flush() ; return wDic
    def set(self,phPos,val,old=None):
        """Sets phonemes at position phPos to new value.
        Caller should flush the file buffer when done.
        old is the slot's original phonemes and must be
        given the first time a position is set (it is
        remembered for close() and val must not be longer
        than it); it must be omitted on later calls."""
        # print "Debugger: setting %x to %s" % (phPos,val)
        if old:
            assert not phPos in self.restoreDic, "Cannot call set() twice on same phoneme while re-specifying 'old'"
            assert len(val) <= len(old), "New phoneme is too long!"
            self.restoreDic[phPos] = old
        else: assert phPos in self.restoreDic, "Must specify old values (for restore) when setting for first time"
        self.dFile.seek(phPos)
        getBuf(self.dFile).write(val+as_utf8(chr(0)))
    def __del__(self):
        "WARNING - this might not be called before exit - best to call close() manually"
        if not self.voice: return # read-only instance, or already closed
        self.close()
    def close(self):
        "Writes the original phonemes back over every slot that was changed, closes the voice file, unregisters this instance and removes the voice's lock file"
        for phPos,val in self.restoreDic.items():
            self.set(phPos,val)
        self.dFile.close()
        del MacBritish_System_Lexicon.instances[self.voice]
        assert not os.system("rm -f /tmp/"+self.voice+".PCMWave.lock")
        self.voice=None
def stdout_width_unix(): # assumes isatty
   # Ask the terminal on file descriptor 1 for its window size (TIOCGWINSZ);
   # the first two shorts of the reply are (rows, columns) and we want the columns.
   import struct,fcntl,termios
   reply = fcntl.ioctl(1,termios.TIOCGWINSZ,'1234') # '1234' = 4-byte buffer for the two shorts
   rows,cols = struct.unpack('hh', reply)
   return cols
3645
# The table of formats is built here, at the very end of the
# module, in case it refers to anything that was defined later
lexFormats = LexFormats()

# When run as a script, main()'s return value becomes the exit status
if __name__ == "__main__":
   sys.exit(main())