#!/usr/local/bin/perl -w
# stress2sphinx
# original by eht
#
# Reads a pronunciation dictionary (one "LABEL PHONE PHONE ..." entry per
# line, vowels carrying a 0/1/2 stress digit) from the files named on the
# command line, or from standard input, and prints each entry with:
#   1. T/D rewritten to the flap DX in the contexts listed below,
#   2. unstressed vowels mapped to their reduced forms (ER0->AXR0, IH0->IX0,
#      AH0->AX0, restricted IY0->IX0),
#   3. all stress digits removed from the pronunciation.
# The word label is passed through untouched.
#
# DX contexts:
#   left:
#     [+vowel]
#     [+vowel] R
#     ER0
#     AXR0
#   right:
#     [+vowel -stress] #
#     [+vowel -stress] [anything but "N"]  (implemented as two opts)
#     ER0
#     AXR0
#

use strict;
use warnings;

# Left and right flap contexts, kept as plain strings exactly as in the
# original so the alternations stay byte-for-byte identical.
# NOTE(review): the right context ends in 'AXR' although the table above says
# AXR0; and the bare 'ER0' alternative also admits a following N.  Both are
# preserved as-is -- confirm against to-flap-newX.sed before tightening.
my $DX_LEFT  = '\b[AEIOU][A-Z]\d|\b[AEIOU][A-Z]\d R|\bER0|\bAXR0';
my $DX_RIGHT = '[AEIOU][A-Z]0$|[AEIOU][A-Z]0 [^N]|[AEIOU][A-Z]0 NG|ER0|AXR';

# Crossproduct of the left and right contexts around a T or D target.
# Compiled once instead of being re-interpolated for every input line.
my $FLAP_RE = qr/\b(${DX_LEFT}) [TD] (${DX_RIGHT})/;

# convert_pronunciation($phonetic) -> string
#
# Normalises whitespace in a stress-marked phone string, applies the flap
# and unstressed-vowel rewrites, strips the stress digits and returns the
# resulting phone string.
sub convert_pronunciation {
    my ($phones) = @_;

    # Clean up pronunciation, remove extraneous spaces
    $phones =~ s/^\s+//;
    $phones =~ s/\s+$//;
    $phones =~ s/\s+/ /g;

    # 1. add flap (based on to-flap-newX.sed)
    #    This is the crossproduct of left and right contexts
    #    itemized above
    #    Target: [TD]
    #    Result: $1 DX $2
    # A match consumes its right-hand vowel, which may itself be the left
    # context of the next flap, so repeat until nothing changes.
    while ( $phones =~ s/$FLAP_RE/$1 DX $2/g ) { }

    # 2. TS (based on tsX.sed)
    # s/\bT S\b/TS/g;  # TS is no longer in the default phoneset

    # 3. deletable stops (based on deletableX.sed)
    #    2nd substitution makes explicit that +deletable precedes
    #    DH, TH, and newly created TS
    # s/\b([BPDTGK]) ([BPDTGK])\b/$1D $2/g;
    # s/\b([BPDTGK]) (DH|TH|TS)\b/$1D $2/g;
    # s/\b([BPDTGK])$/$1D/g;
    # s/\bT L\b/TD L/g;

    # 3. Remove deletable stops
    # s/\bTD/T/g;
    # s/\bPD/T/g;
    # s/\bBD/T/g;
    # s/\bDD/T/g;
    # s/\bGD/T/g;
    # s/\bKD/T/g;

    # 4. unstressed Sphinx 2 forms (based on postmodX.sed)
    $phones =~ s/\bER0\b/AXR0/g;
    $phones =~ s/\bIH0\b/IX0/g;
    $phones =~ s/\bAH0\b/AX0/g;
    $phones =~ s/\bIY0 NG\b/IX0 NG/g;

    # Note restrictions on IY0 transforms.  According to postmodX.sed:
    #   IY0 must be (1) word-initial or after word-initial B or D
    #   and (2) not at the end of the word
    # Hence, "BE(2) B IY0" is not transformed to "B IX" but to "B IY"
    # and collapses with "BE B IY1".  Is this as intended?
    $phones =~ s/^IY0 /IX0 /g;
    $phones =~ s/^B IY0 /B IX0 /g;
    $phones =~ s/^D IY0 /D IX0 /g;

    # 5. remove the stress information
    #    Note that the sed 's/0//g' etc. commands will remove [012]
    #    from the word label as well.  Here only the pronunciation is touched.
    $phones =~ s/[012]//g;

    return $phones;
}

# format_entry($line) -> formatted output line, or nothing for a blank line
#
# Splits one dictionary line into label and pronunciation, converts the
# pronunciation and returns the "%-30s %s\n" formatted entry.
# Dies with "Bad dictionary format\n" when a non-blank line has no
# whitespace after its first token.
sub format_entry {
    my ($line) = @_;

    # chomp, not chop: a final line without a trailing newline must not
    # lose the last character of its last phone.
    chomp $line;
    return if $line !~ /\S/;

    # Extract label and pronunciation.
    # Note that by separating pronunciation from other strings,
    # it is no longer possible to inadvertently corrupt the word label
    # (e.g., "A42128" becomes "A48" in new2sphinx)
    my ( $label, $phonetic ) = $line =~ /(\S+)\s+(.*)/s
        or die "Bad dictionary format\n";

    # RAH 2.19.2000 - what is the relevance of this?
    # Neither is used again and it seems that (2) for multiple
    # pronunciations should remain
    # $index = 1;
    # $index = $1 if ( $word =~ s/\((\d+)\)// );

    return sprintf "%-30s %s\n", $label, convert_pronunciation($phonetic);
}

# main()
#
# Filters every line from the files in @ARGV (or standard input) through
# format_entry and prints the non-blank results to standard output.
sub main {
    while ( my $line = <> ) {
        my $entry = format_entry($line);
        print $entry if defined $entry;
    }
    return;
}

# Run only when executed as a script, so the helpers can be loaded without
# consuming standard input.
main() unless caller();

1;