/* phoneme.c */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>

#include "english.h"

#define FALSE (0)
#define TRUE (!0)

/*
**      English to Phoneme translation.
**
**      Rules are made up of four parts:
**      
**              The left context.
**              The text to match.
**              The right context.
**              The phonemes to substitute for the matched text.
**
**      Procedure:
**
**              Separate each block of letters (apostrophes included)
**              and add a space on each side.  For each unmatched 
**              letter in the word, look through the rules where the 
**              text to match starts with the letter in the word.  If 
**              the text to match is found and the right and left 
**              context patterns also match, output the phonemes for 
**              that rule and skip to the next unmatched letter.
**
**
**      Special Context Symbols:
**
**              #       One or more vowels
**              :       Zero or more consonants
**              ^       One consonant.
**              .       One of B, D, V, G, J, L, M, N, R, W or Z (voiced 
**                      consonants)
**              %       One of ER, E, ES, ED, ING, ELY (a suffix)
**                      (Right context only)
**              +       One of E, I or Y (a "front" vowel)
*/

#ifndef ORIGINAL

/*
**      Return a freshly malloc'ed copy of the NUL-terminated string s.
**      The copy belongs to the caller.
**
**      If the allocation fails a message is written to stderr and the
**      program exits with status 1 (the same policy Init_Rules() uses
**      when the rule file cannot be opened), so the result is never
**      NULL.
*/
char *copystr(s)
        char *s;
        {
        size_t size = strlen(s) + 1;    /* room for the terminating NUL */
        char *p = malloc(size);

        if (p == NULL)
                {
                fprintf(stderr, "Out of memory copying string\n");
                exit(1);
                }
        memcpy(p, s, size);
        return(p);
        }

/*
**      Read the next rule from rfile and split it into its four fields.
**
**      A rule line is laid out as
**
**              left|mid|right=rep
**
**      Lines whose first character is '*' are skipped as comments.
**      Lines that do not contain the two '|' separators followed by an
**      '=' are skipped as well (with a warning on stderr unless the line
**      is blank), which keeps every scan inside the line buffer.
**
**      Field handling:
**              left    letters are upper-cased, every '<' becomes a blank
**              mid     letters are upper-cased
**              right   every '>' becomes a blank
**              rep     the first character after '=' if it compares
**                      >= ' ', plus every following character that
**                      compares > ' '; empty otherwise
**
**      left, mid, right and rep are filled with strcpy(), so each must
**      have room for a full input line (128 bytes).
**
**      Returns TRUE when a rule has been stored, FALSE when fgets()
**      reports end of file or a read error.
*/
int read_one_rule(rfile, left, mid, right, rep)
        FILE *rfile;
        char *left;
        char *mid;
        char *right;
        char *rep;
        {
        char line[128];
        char *bar1, *bar2, *eq; /* the two '|' separators and the '=' */
        char *s;

        for (;;)
                {
                if (fgets(line, sizeof line, rfile) == NULL)
                        return(FALSE);
                if (*line == '*')
                        continue;       /* comment line */

                /* strchr() stops at the NUL, so a missing delimiter is
                ** detected instead of being searched for past the buffer */
                bar1 = strchr(line, '|');
                bar2 = (bar1 != NULL) ? strchr(bar1 + 1, '|') : NULL;
                eq = (bar2 != NULL) ? strchr(bar2 + 1, '=') : NULL;
                if (eq != NULL)
                        break;

                /* Not a rule: say so unless the line is only white space */
                line[strcspn(line, "\r\n")] = '\0';
                for (s = line; *s == ' ' || *s == '\t'; s++)
                        ;
                if (*s != '\0')
                        fprintf(stderr, "Ignoring malformed rule: \"%s\"\n",
                                line);
                }

        /* Left context */
        for (s = line; s < bar1; s++)
                {
                /* <ctype.h> functions need an unsigned char value */
                if (isalpha((unsigned char)*s))
                        *s = toupper((unsigned char)*s);
                else if (*s == '<')
                        *s = ' ';
                }
        *bar1 = '\0';
        strcpy(left, line);

        /* Text to match */
        for (s = bar1 + 1; s < bar2; s++)
                {
                if (isalpha((unsigned char)*s))
                        *s = toupper((unsigned char)*s);
                }
        *bar2 = '\0';
        strcpy(mid, bar1 + 1);

        /*
        ** Right context.
        ** NOTE(review): unlike left and mid, letters here are not
        ** upper-cased -- confirm that this is intended.
        */
        for (s = bar2 + 1; s < eq; s++)
                {
                if (*s == '>')
                        *s = ' ';
                }
        *eq = '\0';
        strcpy(right, bar2 + 1);

        /* Replacement: may start with a blank, then runs to white space */
        s = eq + 1;
        if (*s >= ' ')
                {
                s++;
                while (*s > ' ')
                        s++;
                }
        *s = '\0';
        strcpy(rep, eq + 1);

        return(TRUE);
        }

void Init_Rules()
        {
#define max_rules 100 /* for now */
        FILE *rulefile;
        char left[128], mid[128], right[128], rep[128];
        int lastrule[27] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
        int i;

        rulefile = fopen("rules-navy", "r");
        if (rulefile == NULL)
                {
                fprintf(stderr, "Cannot open rule file 'rules-navy'\n");
                exit(1);
                }
        Rules = malloc(27*sizeof(Rule *));
        Rules[0] = punct_rules = malloc(max_rules * sizeof(Rule));
        Rules[1] = A_rules = malloc(max_rules * sizeof(Rule));
        Rules[2] = B_rules = malloc(max_rules * sizeof(Rule));
        Rules[3] = C_rules = malloc(max_rules * sizeof(Rule));
        Rules[4] = D_rules = malloc(max_rules * sizeof(Rule));
        Rules[5] = E_rules = malloc(max_rules * sizeof(Rule));
        Rules[6] = F_rules = malloc(max_rules * sizeof(Rule));
        Rules[7] = G_rules = malloc(max_rules * sizeof(Rule));
        Rules[8] = H_rules = malloc(max_rules * sizeof(Rule));
        Rules[9] = I_rules = malloc(max_rules * sizeof(Rule));
        Rules[10] = J_rules = malloc(max_rules * sizeof(Rule));
        Rules[11] = K_rules = malloc(max_rules * sizeof(Rule));
        Rules[12] = L_rules = malloc(max_rules * sizeof(Rule));
        Rules[13] = M_rules = malloc(max_rules * sizeof(Rule));
        Rules[14] = N_rules = malloc(max_rules * sizeof(Rule));
        Rules[15] = O_rules = malloc(max_rules * sizeof(Rule));
        Rules[16] = P_rules = malloc(max_rules * sizeof(Rule));
        Rules[17] = Q_rules = malloc(max_rules * sizeof(Rule));
        Rules[18] = R_rules = malloc(max_rules * sizeof(Rule));
        Rules[19] = S_rules = malloc(max_rules * sizeof(Rule));
        Rules[20] = T_rules = malloc(max_rules * sizeof(Rule));
        Rules[21] = U_rules = malloc(max_rules * sizeof(Rule));
        Rules[22] = V_rules = malloc(max_rules * sizeof(Rule));
        Rules[23] = W_rules = malloc(max_rules * sizeof(Rule));
        Rules[24] = X_rules = malloc(max_rules * sizeof(Rule));
        Rules[25] = Y_rules = malloc(max_rules * sizeof(Rule));
        Rules[26] = Z_rules = malloc(max_rules * sizeof(Rule));
        while (read_one_rule(rulefile, left, mid, right, rep))
                {
                int thisrule;
                if (isalpha(*mid) && isupper(*mid))
                        {
                        thisrule = *mid - 'A' + 1;
                        }
                else
                        {
                        thisrule = 0; /* punct */
                        }
/*
                fprintf(stderr, "Added rule[%d][%d] = {left=%s mid=%s right=%s rep=%s}\n",
                        thisrule, lastrule[thisrule],
                        left, mid, right, rep);
*/
                Rules[thisrule][lastrule[thisrule]][0] = copystr(left);
                Rules[thisrule][lastrule[thisrule]][1] = copystr(mid);
                Rules[thisrule][lastrule[thisrule]][2] = copystr(right);
                Rules[thisrule][lastrule[thisrule]][3] = copystr(rep);
                lastrule[thisrule] += 1;
                }
/*
        fprintf(stderr, "Rules read\n");
*/
        for (i = 0; i < 27; i++)
                {
                Rules[i][lastrule[i]][0] = Anything;
                Rules[i][lastrule[i]][1] = NULL;
                Rules[i][lastrule[i]][2] = Anything;
                Rules[i][lastrule[i]][3] = Silent;
                }
        }
#endif

/*
**      Return 1 when chr is one of the upper-case vowels A, E, I, O or U
**      and 0 for anything else (Y and lower-case letters included).
*/
int isvowel(chr)
        char chr;
        {
        switch (chr)
                {
        case 'A':
        case 'E':
        case 'I':
        case 'O':
        case 'U':
                return 1;

        default:
                return 0;
                }
        }

/*
**      Return 1 when chr is an upper-case letter that isvowel() does not
**      accept, 0 otherwise.  Blanks, apostrophes, the terminating NUL and
**      lower-case letters are therefore not consonants.
*/
int isconsonant(chr)
        char chr;
        {
        /* <ctype.h> functions are only defined for unsigned char values
        ** (and EOF), so a possibly negative char has to be converted */
        if (!isupper((unsigned char)chr))
                return 0;
        return !isvowel(chr);
        }

// #ifndef ORIGINAL -- debugging -DORIGINAL

/*
**      Translate one word by applying find_rule() repeatedly until the
**      end of the string is reached.
**
**      word[0] is skipped (the code expects a leading blank there) and
**      translation starts at word[1].  For each position the rule table
**      is picked from the character found there: Rules[1]..Rules[26]
**      for 'A'..'Z', Rules[0] for everything else.
**
**      Always returns 0.  The function used to fall off its end without
**      returning a value, so no caller can depend on the result.
*/
int xlate_word(word)
        char word[];
        {
        int find_rule();        /* defined further down in this file */
        int index;      /* Current position in word */
        int type;       /* Rule table: 0 = non-letter, 1..26 = A..Z */

        if (word[0] == '\0')
                return 0;       /* not even the leading blank */

        index = 1;      /* Skip the initial blank */

        /* Testing before the first step keeps an empty word from being
        ** handed to find_rule() and from being indexed past its NUL */
        while (word[index] != '\0')
                {
                char c = word[index];

                /* A range test keeps the table index within 1..26 whatever
                ** the locale or the signedness of char */
                if (c >= 'A' && c <= 'Z')
                        type = c - 'A' + 1;
                else
                        type = 0;

                index = find_rule(word, index, Rules[type]);
                }

        return 0;
        }

/*
**      Apply the first rule in rules that fits word at position index.
**
**      A rule fits when its match text equals the characters starting
**      at word[index], its left context is accepted by leftmatch() for
**      the text ending at word[index-1], and its right context is
**      accepted by rightmatch() for the text following the match.  The
**      rule's output field is then passed to outstring() (not defined
**      in the code visible here) and the index of the first character
**      after the matched text is returned.
**
**      The table must end with an entry whose match text is NULL.  If
**      that entry is reached, an error is printed on stderr and
**      index+1 is returned so that the caller skips the character.
**
**      index must be at least 1 because word[index-1] is examined.
*/
int find_rule(word, index, rules)
        char word[];
        int index;
        Rule *rules;
        {
        int leftmatch(), rightmatch();  /* defined further down in this file */
        Rule *rule;
        char *match;
        int remainder;

        for (rule = rules; ; rule++)    /* Search for the rule */
                {
                match = (*rule)[1];

                if (match == NULL)      /* end of table: bad symbol! */
                        {
                        fprintf(stderr,
"Error: Can't find rule for: '%c' in \"%s\"\n", word[index], word);
                        return index+1; /* Skip it! */
                        }

                /* Compare the match text with the word; the word's NUL
                ** ends the comparison at the latest */
                remainder = index;
                while (*match != '\0' && *match == word[remainder])
                        {
                        match++;
                        remainder++;
                        }

                if (*match != '\0')     /* found mismatch */
                        continue;

                /* A rule with empty match text consumes nothing; accepting
                ** it would hand the same index back to the caller forever */
                if (remainder == index)
                        continue;

                if (!leftmatch((*rule)[0], &word[index-1]))
                        continue;

                if (!rightmatch((*rule)[2], &word[remainder]))
                        continue;

                outstring((*rule)[3]);
                return remainder;
                }
        }

// #endif -- DEBUGGING -DORIGINAL

/*
**      Test whether the text that ends at *context satisfies the left
**      context pattern.  Returns TRUE or FALSE.
**
**      The pattern is read from its last character back to its first
**      while the text is read backwards from context.  Letters,
**      apostrophes and blanks must match the text exactly; the symbols
**      '#', ':', '^', '.' and '+' have the meanings listed in the
**      comment at the top of this file.  '%' and any other character
**      are reported on stderr and make the match fail.  An empty
**      pattern matches any context.
**
**      NOTE(review): the text pointer is moved backwards without a lower
**      bound.  The '#' and ':' loops stop at the first character that is
**      not an upper-case letter (presumably the leading blank of the
**      word buffer); a pattern that asks for text to the left of that
**      blank would read before the buffer -- confirm that no rule
**      does so.
*/
int leftmatch(pattern, context)
        char *pattern;  /* first char of pattern to match in text */
        char *context;  /* last char of text to be matched */
        {
        char *pat;
        char *text;

        text = context;

        /*
        ** Start one past the end of the pattern and step back before
        ** each use, so that pat never points before pattern itself.
        ** An empty pattern never enters the loop.
        */
        pat = pattern + strlen(pattern);
        while (pat > pattern)
                {
                pat--;

                /* First check for simple text or space */
                if (isalpha((unsigned char)*pat) || *pat == '\'' || *pat == ' ')
                        {
                        if (*pat != *text)
                                return FALSE;
                        text--;
                        continue;
                        }

                switch (*pat)
                        {
                case '#':       /* One or more vowels */
                        if (!isvowel(*text))
                                return FALSE;
                        do
                                text--;
                        while (isvowel(*text));
                        break;

                case ':':       /* Zero or more consonants */
                        while (isconsonant(*text))
                                text--;
                        break;

                case '^':       /* One consonant */
                        if (!isconsonant(*text))
                                return FALSE;
                        text--;
                        break;

                case '.':       /* B, D, V, G, J, L, M, N, R, W, Z */
                        /* strchr() also finds the terminating NUL, so a
                        ** NUL in the text has to be rejected separately */
                        if (*text == '\0' || strchr("BDVGJLMNRWZ", *text) == NULL)
                                return FALSE;
                        text--;
                        break;

                case '+':       /* E, I or Y (front vowel) */
                        if (*text != 'E' && *text != 'I' && *text != 'Y')
                                return FALSE;
                        text--;
                        break;

                case '%':       /* suffix symbol: right context only */
                default:
                        fprintf(stderr, "Bad char in left rule: '%c'\n", *pat);
                        return FALSE;
                        }
                }

        return TRUE;
        }


/*
**      Test whether the text that starts at *context satisfies the right
**      context pattern.  Returns TRUE or FALSE.
**
**      Pattern and text are both read left to right.  Letters,
**      apostrophes and blanks must match the text exactly; the symbols
**      '#', ':', '^', '.', '+' and '%' have the meanings listed in the
**      comment at the top of this file.  Any other character is
**      reported on stderr and makes the match fail.  An empty pattern
**      matches any context.
**
**      Every test that advances the text pointer fails on the
**      terminating NUL, so the scan stays inside the string.
*/
int rightmatch(pattern, context)
        char *pattern;  /* first char of pattern to match in text */
        char *context;  /* first char of text following the match */
        {
        char *pat;
        char *text;

        text = context;

        for (pat = pattern; *pat != '\0'; pat++)
                {
                /* First check for simple text or space */
                if (isalpha((unsigned char)*pat) || *pat == '\'' || *pat == ' ')
                        {
                        if (*pat != *text)
                                return FALSE;
                        text++;
                        continue;
                        }

                switch (*pat)
                        {
                case '#':       /* One or more vowels */
                        if (!isvowel(*text))
                                return FALSE;
                        do
                                text++;
                        while (isvowel(*text));
                        break;

                case ':':       /* Zero or more consonants */
                        while (isconsonant(*text))
                                text++;
                        break;

                case '^':       /* One consonant */
                        if (!isconsonant(*text))
                                return FALSE;
                        text++;
                        break;

                case '.':       /* B, D, V, G, J, L, M, N, R, W, Z */
                        /* strchr() also finds the terminating NUL, so a
                        ** NUL in the text has to be rejected separately */
                        if (*text == '\0' || strchr("BDVGJLMNRWZ", *text) == NULL)
                                return FALSE;
                        text++;
                        break;

                case '+':       /* E, I or Y (front vowel) */
                        if (*text != 'E' && *text != 'I' && *text != 'Y')
                                return FALSE;
                        text++;
                        break;

                case '%':       /* ER, E, ES, ED, ING, ELY (a suffix) */
                        if (*text == 'E')
                                {
                                text++;
                                if (text[0] == 'L')
                                        {
                                        /* ELY is taken whole; otherwise
                                        ** only the E: don't gobble L */
                                        if (text[1] == 'Y')
                                                text += 2;
                                        }
                                else if (*text == 'R' || *text == 'S'
                                   || *text == 'D')
                                        text++;
                                }
                        else if (text[0] == 'I' && text[1] == 'N'
                           && text[2] == 'G')
                                text += 3;
                        else
                                return FALSE;
                        break;

                default:
                        fprintf(stderr, "Bad char in right rule:'%c'\n", *pat);
                        return FALSE;
                        }
                }

        return TRUE;
        }