/*

XLATE.C - by Pope Lou Duchez

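This is a filter program to translate text from one language into
another.  Supply the name of a lexicon file that lists the words to
replace, optionally followed by the name of the source text you want
to translate (standard input is read if it is omitted) and the name
of a destination file.  Without a destination file, output goes to
the standard output device, and can be routed to a file or the
printer.  The -Wnnn option word-wraps the output at nnn characters.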

REVISION HISTORY

04-29-95: Initial release

05-01-95: Bug fix: eliminated error where program would miss text to
            translate if the same text was found earlier on the line, but
            in the middle of a word (this was a problem only if the text
            needed to start or end a word)

          Code is documented

          Upper-casing code is revised to be ANSI-compliant

          Quotes may be indicated inside English or foreign texts by
            using two quotes in immediate succession

05-02-95: WRDSIZ is increased from 30 to 60 to accommodate longer texts

          When called without parameters, XLATE lists the revision date

05-06-95: Word wrap option added

          Third file (destination file) may be named on command line

          File names specified on errors

05-07-95: Stopped capitalizing command-line options to make things work
            properly under Unix

11-14-95: Clean up extraneous code that the Bearded Guy's compiler spotted

04-21-96: If identical English entries are found in the lexicon, select
            one at semi-random

09-24-96: Input file is now optional -- takes input from STDIN if no
            file is specified (useful for piping operations)

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define LAST_REVISED "09/24/96" /* revision date shown in the usage text */

#define WRDSIZ 60       /* max length of English/foreign texts in lexicon */
#define LEXMAX 500      /* max # of lexicon entries */
#define STRSIZ 1023     /* standard string length for internal processing */

#define BOOLEAN char    /* boolean variables */
#define TRUE 1
#define FALSE 0

/* Accepted range for the -W wrap width.  MAXWIDTH is parenthesized so the
   division cannot be re-associated when the macro is used inside a larger
   expression (e.g. "x % MAXWIDTH"). */
#define MAXWIDTH (STRSIZ / 2)
#define MINWIDTH 20

/* One lexicon entry: an "English" text paired with its "foreign"
   translation, plus the bookkeeping used while scanning a line. */
struct xword
{
  char english[WRDSIZ + 1];   /* text to search for                        */
  char foreign[WRDSIZ + 1];   /* text that replaces it                     */
  int englen;                 /* length of the English text                */
  int forlen;                 /* length of the foreign text                */
  int foundpos;               /* where found in current line, -1 if not    */
  BOOLEAN must_start_word;    /* English text must begin a word            */
  BOOLEAN must_end_word;      /* English text must finish a word           */
};

/* File-scope state.  read_lexicon() fills lexicon[] and lexcnt;
   xlate_file() reads them. */
struct xword lexicon[LEXMAX + 1];   /* lexicon: entries 0..lexcnt are in use */
int lexcnt;                         /* index of the last entry (-1 = empty) */
BOOLEAN out_to_file;                /* not referenced by the code in this file */
FILE *outfile;                      /* not referenced by the code in this file */
/*
-------------------------------------------------
*/
/* Fatal-error handler: sounds the bell, writes errmsg on a line of its own
   to standard output, then ends the program with exit status 1.  Never
   returns. */
void error_out(char *errmsg)
{
  fprintf(stdout, "\007\n%s\n", errmsg);
  exit(1);
}
/*
-------------------------------------------------
*/
/* Convert the NUL-terminated string strin to uppercase, in place.

   Each character is handed to toupper() as an unsigned char: passing a
   negative plain-char value (any byte >= 0x80 where char is signed) to a
   <ctype.h> function is undefined behaviour. */
void str_toupper(char *strin)
{
  unsigned char *chrptr;

  for (chrptr = (unsigned char *) strin; *chrptr != '\0'; chrptr++)
    *chrptr = (unsigned char) toupper(*chrptr);
}
/*
-------------------------------------------------
*/
void read_lexicon(char *filein)   /* read lexicon file */
{
  FILE *fp;
  char linein[STRSIZ + 1], errmsg[STRSIZ + 1],
       tmpeng[WRDSIZ + 1], tmpfor[WRDSIZ + 1];
  int si, di, eng_offset;
  BOOLEAN line_err, in_quotes, tmp_must_start, tmp_must_end;

  fp = fopen(filein, "r");                                    /* open file */
  if (fp == NULL)
  {
    sprintf(errmsg, "Error opening lexicon file %s", filein);
    error_out(errmsg);
  }
  lexcnt = -1;
  while (fgets(linein, STRSIZ, fp) != NULL)           /* read file to end */
  {
    si = 0;
    line_err = FALSE;
    
    /* step past first quotation mark on line ... */
    
    while ((linein[si] != '\0') && (linein[si] != '"')) si++;
    if (linein[si] == '\0') line_err = TRUE;
                       else si++;
    
    /* Transfer chars into temporary "English text" buffer, until a
       terminating quote is seen or the line ends.  (Two quotes in
       immediate succession are treated as a single quote that's part
       of the English text.) */
    
    in_quotes = TRUE;
    di = 0;
    while ((linein[si] != '\0') && (di <= WRDSIZ) && (in_quotes))
    {
      if (linein[si] == '"')
      {
        if (linein[si + 1] == '"') { tmpeng[di] = '"'; di++; si++; }
                               else { in_quotes = FALSE; }
      }
      else { tmpeng[di] = linein[si]; di++; }
      si++;
    }
    tmpeng[di] = '\0';
    
    /* Determine if English text started or ended with a space: they
       would indicate that the English text must be found starting/
       ending a word if it's going to be translated. */
    
    tmp_must_start = (tmpeng[0] == ' ');
    if (di > 0) tmp_must_end = (tmpeng[di - 1] == ' ');
    
    /* Clip all spaces off the end of the English text, and set an
       "offset" index to the start of the text after any leading
       spaces. */
    
    di--;
    while ((di >= 0) && (tmpeng[di] == ' ')) tmpeng[di--] = '\0';
    eng_offset = 0;
    while (tmpeng[eng_offset] == ' ') eng_offset++;
    if (tmpeng[eng_offset] == '\0') line_err = TRUE;
    
    /* Skip past the comma that separates English/foreign texts. */
    
    while ((linein[si] != '\0') && (linein[si] != ',')) si++;
    if (linein[si] != ',') line_err = TRUE;
    
    /* Skip past the quote that starts the foreign text. */
    
    while ((linein[si] != '\0') && (linein[si] != '"')) si++;
    if (linein[si] == '"') si++;
                      else line_err = TRUE;
    
    /* Transfer text into the "foreign" buffer until a terminating
       quote is encountered.  Again, two quotes in immediate succession
       are treated as a single quote that's part of the foreign text. */
    
    in_quotes = TRUE;
    di = 0;
    while ((linein[si] != '\0') && (di <= WRDSIZ) && (in_quotes))
    {
      if (linein[si] == '"')
      {
        if (linein[si + 1] == '"') { tmpfor[di] = '"'; di++; si++; }
                               else { in_quotes = FALSE; }
      }
      else { tmpfor[di] = linein[si]; di++; }
      si++;
    }
    tmpfor[di] = '\0';
    if (in_quotes) line_err = TRUE;
    
    /* If no errors, copy temporary strings into new lexicon entry. */
    
    if (!line_err)
    {
      if (lexcnt < LEXMAX)    /* Add entry only if there's room in array */
      {
        lexcnt++;
        strcpy(lexicon[lexcnt].english, tmpeng + eng_offset);
        strcpy(lexicon[lexcnt].foreign, tmpfor);
        str_toupper(lexicon[lexcnt].english);
        lexicon[lexcnt].must_start_word = tmp_must_start;
        lexicon[lexcnt].must_end_word = tmp_must_end;
        lexicon[lexcnt].englen = strlen(lexicon[lexcnt].english);
        lexicon[lexcnt].forlen = strlen(lexicon[lexcnt].foreign);
      }
      else
      {
        sprintf(errmsg,
                "Error: lexicon %s goes over the limit of %d entries.",
                filein, LEXMAX);
        error_out(errmsg);
      }
    }
    if (line_err) printf("Invalid lexicon line:\n %s\n", linein);
  }
  
  /* Done reading lexicon. */
  
  fclose(fp);
  if (lexcnt == -1)
  {
    sprintf(errmsg, "Empty lexicon file %s - program terminating.", filein);
    error_out(errmsg);
  } 
}
/*
-------------------------------------------------
*/
/* Output one translated line, word-wrapping it if requested.

   linein   - the line to output; the text proper starts at linein[1]
              (linein[0] is a filler character that is never printed)
   bufout   - caller-owned work buffer of STRSIZ + 1 chars.  In wrap mode
              it carries text not yet printed from one call to the next,
              so the caller must set it to "" before the first call.
   wrapit   - TRUE to re-flow text into lines of at most "width" chars
   width    - wrap width; only examined when wrapit is TRUE
   to_file  - TRUE to write to fout, FALSE to write to standard output
   fout     - destination stream; only used when to_file is TRUE
   fp_eof   - TRUE when the input is exhausted, so everything still held
              in bufout must be flushed

   In wrap mode an empty input line marks the end of a paragraph: pending
   text is flushed and a blank line is written. */
void txtout(char *linein, char *bufout, BOOLEAN wrapit, int width,
            BOOLEAN to_file, FILE *fout, BOOLEAN fp_eof)
{
  char buffer[STRSIZ + 1];
  int bufpos;
  BOOLEAN printit, flush_all;
  FILE *dest;

  dest = to_file ? fout : stdout;
  flush_all = (BOOLEAN) (fp_eof || (linein[1] == '\0'));

  if (wrapit && (bufout[0] != '\0'))
  {
    if (strlen(bufout) + 1 + strlen(linein + 1) > STRSIZ)
    {

      /* Joining the pending text and the new line would overrun bufout.
         Write the pending text as a line of its own rather than
         overflowing the buffer or dropping text. */

      fprintf(dest, "%s\n", bufout);
      bufout[0] = '\0';
      strncat(bufout, linein + 1, STRSIZ);
    }
    else
    {
      strcat(bufout, " ");
      strcat(bufout, linein + 1);
    }
  }
  else
  {
    bufout[0] = '\0';                      /* bounded copy into bufout */
    strncat(bufout, linein + 1, STRSIZ);
  }

  do      /* loop for outputting text */
  {
    if (wrapit)   /* wrapping text */
    {
      if ((int) strlen(bufout) > width)      /* find wrap point in buffer */
      {

        /* Break at the last space at or before "width"; if there is
           none, break the overlong word at "width".  The spaces at the
           break are dropped, the remainder is parked in "buffer". */

        bufpos = width;
        while ((bufpos >= 1) && (bufout[bufpos] != ' ')) bufpos--;
        if (bufpos < 1) bufpos = width;
        while (bufout[bufpos] == ' ') bufout[bufpos++] = '\0';
        strcpy(buffer, bufout + bufpos);
        bufout[bufpos] = '\0';
        printit = TRUE;
      }
      else if (flush_all)
      {
        printit = TRUE;           /* End of file or end of paragraph: */
        buffer[0] = '\0';         /* output entire buffer */
      }
      else printit = FALSE;       /* keep short text for the next call */
    }
    else
    {
      printit = TRUE;  /* No wrap: output entire buffer */
      buffer[0] = '\0';
    }

    if (printit)  /* We have output: print it out */
    {
      fprintf(dest, "%s\n", bufout);
      strcpy(bufout, buffer);
    }
  } while ((wrapit) && (((int) strlen(bufout) > width) ||
           (flush_all && (bufout[0] != '\0'))));

  if (wrapit && (linein[1] == '\0'))  /* write blank line in wrap mode */
    fprintf(dest, "\n");
}
/*
-------------------------------------------------
*/
/* Translate a text, line by line, using the global lexicon.

   from_file - TRUE to read from the file named fname, FALSE to read
               standard input
   fname     - source file name (only used when from_file is TRUE)
   to_file   - TRUE to write to the file named foutname, FALSE to write
               to standard output
   foutname  - destination file name (only used when to_file is TRUE)
   wrapit    - TRUE to word-wrap the output
   width     - wrap width (only meaningful when wrapit is TRUE)

   Calls error_out() -- which does not return -- if a file cannot be
   opened.  Matching is case-insensitive; translations are applied
   strictly left to right and replacement text is never re-translated. */
void xlate_file(BOOLEAN from_file, char *fname, BOOLEAN to_file,
                char *foutname, BOOLEAN wrapit, int width)
{
  FILE *fp, *fout = NULL;
  char linein[2*STRSIZ + 3], caplinein[STRSIZ + 3], buffer[STRSIZ + 1],
       bufout[STRSIZ + 1], errmsg[STRSIZ + 1];
  char *txtptr, *front, *back, *startptr, *tmptxtptr;
  int lexstep, offset, i, lexpos, bestlex, bestlexpos, foundcount;
  BOOLEAN wordchanged, good_front, good_back, setnewlex, lead_up, fp_eof;

  /* File names are printed with a precision limit so that a very long
     name cannot overflow errmsg. */

  if (from_file) fp = fopen(fname, "r");                    /* open file */
            else fp = stdin;
  if (fp == NULL)
  {
    sprintf(errmsg, "Error opening source file %.900s", fname);
    error_out(errmsg);
  }

  if (to_file)      /* optional output to file */
  {
    fout = fopen(foutname, "w");
    if (fout == NULL)
    {
      sprintf(errmsg, "Error opening destination file %.900s", foutname);
      error_out(errmsg);
    }
  }

  bufout[0] = '\0';

  /* Trick code (sort of): To determine whether any English text is at
     the start of a word, the program looks at the character appearing
     just before the English text.  At element 0 of a string, this could
     be a problem.  Solution: set element 0 of string to a space, and
     read text in / spit text out starting at element 1 of the string. */

  linein[0] = ' ';
  do
  {
    fp_eof = (BOOLEAN) (fgets(linein + 1, STRSIZ, fp) == NULL);

    /* When fgets() fails it leaves the buffer untouched, i.e. still
       holding the previous (already translated) line.  Empty it so that
       line is not translated and output a second time. */

    if (fp_eof) linein[1] = '\0';

    /* Remove any CRs, LFs, spaces, tabs from the end of the line. */

    i = (int) strlen(linein) - 1;
    while ((i >= 1) && ((linein[i] == '\r') || (linein[i] == '\n') ||
           (linein[i] == ' ') || (linein[i] == '\t')))
          linein[i--] = '\0';

    /* Keep doing the following loop as long as text can be found to
       translate on the line; quit processing the line when no more
       translations can be done. */

    /* Note variable "offset".  If part of a line is translated, any
       subsequent translations must occur after the replacement text
       you've put in (otherwise, you risk translating your translations).
       Variable "offset" keeps track of where we want to start looking
       on the line for text to translate; whenever a translation occurs,
       "offset" is reset to point *after* the replacement text.  (Of
       course, this all means that the program must perform all
       translations in left-to-right order ...) */

    offset = 0;
    do
    {
      strcpy(caplinein, linein);  /* Copy text line and capitalize it */
      str_toupper(caplinein);
      startptr = caplinein + offset;  /* Convert "offset" to char ptr */
      bestlex = -1;
      bestlexpos = 0;
      foundcount = 0;
      for (lexstep = 0; lexstep <= lexcnt; lexstep++) /* look for each word */
      {

        /* In the following loop, the program will try to locate the
           current English text in the capitalized text string (starting
           where "offset" was set to).  This following code needs to be
           a loop, not just a simple "strstr", because we are looking for
           the English text in the proper context: at the start or end
           of a word.  It may be that the first time the English text
           appears, it's in the middle of a word but it has to start or
           end a word for it to get translated.  So we need to skip that
           instance in the string, and see if the English text appears
           again in the right context. */

        tmptxtptr = startptr;
        do
        {
          good_front = TRUE;
          good_back = TRUE;
          txtptr = strstr(tmptxtptr, lexicon[lexstep].english); /* seek */
          if (txtptr == NULL)
          {

            /* Text not found -- flag as unfound. */

            lexicon[lexstep].foundpos = -1;
          }
          else
          {

            /* Text found -- now check context.  The line is already
               uppercased, so isupper() amounts to "is a letter".  The
               characters are passed as unsigned char, as <ctype.h>
               requires. */

            front = txtptr - 1;
            back = txtptr + lexicon[lexstep].englen;
            good_front = (BOOLEAN) !(lexicon[lexstep].must_start_word &&
                                     isupper((unsigned char) *front));
            good_back = (BOOLEAN) !(lexicon[lexstep].must_end_word &&
                                    isupper((unsigned char) *back));
            if (good_front && good_back)
            {

              /* Text is in right context.  Now is it the "best"
                 translatable text identified so far?  It is, if no
                 other text has been found at an earlier spot on
                 the line.  (Another "positive" condition: if other
                 text was found at the exact same spot and this new
                 text is longer.  This would occur if there were
                 two lexicon English texts that start with the same
                 few letters: pick the longer one.) */

              lexpos = (int) (txtptr - caplinein);
              lexicon[lexstep].foundpos = lexpos;
              if (bestlex == -1) setnewlex = TRUE;
              else if (lexpos < bestlexpos) setnewlex = TRUE;
              else if (lexpos > bestlexpos) setnewlex = FALSE;
              else if (lexicon[lexstep].englen > lexicon[bestlex].englen)
                       setnewlex = TRUE;
              else if (lexicon[lexstep].englen < lexicon[bestlex].englen)
                       setnewlex = FALSE;
              else
              {

                /* Another identical English entry found -- don't
                   set the "setnewlex" flag, but increment the number
                   of identical English "best matches" found. */

                foundcount++;
                setnewlex = FALSE;
              }

              /* If this is the best text, flag it. */

              if (setnewlex)
              {
                bestlex = lexstep;
                bestlexpos = lexpos;
                foundcount = 1;
              }
            }
            else  /* wrong context -- look for subsequent instances */
            {
              lexicon[lexstep].foundpos = -1;
              tmptxtptr = txtptr + lexicon[lexstep].englen;
            }
          }
        } while (!good_front || !good_back);
      }
      wordchanged = (BOOLEAN) (bestlex > -1);

      /* If translatable text was found:

         1 - Save string that follows translatable text
         2 - Clip string at point where translatable text starts
         3 - Concatenate "foreign" text to clipped string
         4 - Concatenate string that was saved in step #1

       */

      if (wordchanged)
      {
        if (foundcount > 1)

        /* More than one identical applicable English entries were
           found.  Semi-randomly decide which one to use based on
           line length and text position..  Step through the lexicon
           to find the nth identical entry and set "bestlex" to point
           to that entry. */

        {
          i = (int) ((strlen(linein) + bestlexpos + foundcount - 1)
                     % foundcount) + 1;
          for (lexstep = 0; lexstep <= lexcnt; lexstep++)
          {
            if ((lexicon[lexstep].foundpos == lexicon[bestlex].foundpos) &&
                (lexicon[lexstep].englen   == lexicon[bestlex].englen))
            {
              i--;
              if (i == 0) bestlex = lexstep;
            }
          }
        }
        strcpy(buffer, linein + bestlexpos + lexicon[bestlex].englen);

        /* isupper() returns an arbitrary non-zero int for "true"; reduce
           it to 0/1 before storing it in a char-sized BOOLEAN, otherwise
           the value can be truncated to 0. */

        lead_up = (BOOLEAN) (isupper((unsigned char) linein[bestlexpos]) != 0);
        linein[bestlexpos] = '\0';
        strcat(linein, lexicon[bestlex].foreign);
        offset = bestlexpos + lexicon[bestlex].forlen;
        strcat(linein, buffer);

        /* Make sure text hasn't grown beyond a sensible limit, and that
           "offset" still points inside the (possibly clipped) line. */

        linein[STRSIZ] = '\0';
        if (offset > STRSIZ) offset = STRSIZ;

        /* If the English text started with a capital, make sure that
           foreign text also starts with a capital. */

        if (lead_up)
          linein[bestlexpos] =
            (char) toupper((unsigned char) linein[bestlexpos]);
      }
    } while (wordchanged);

    /* Output.  At end of input there is no new line to print; txtout is
       then only needed in wrap mode, to flush the text it still holds. */

    if (wrapit || !fp_eof)
      txtout(linein, bufout, wrapit, width, to_file, fout, fp_eof);
  } while (!fp_eof);

  /* Done -- close file. */

  if (from_file) fclose(fp);
  if (to_file) fclose(fout);
}
/*
-------------------------------------------------
*/
/* Program entry point.

   Command line:  XLATE lexicon [source] [dest] [-Wnnn]

   Arguments that do not start with '-' are taken, in order, as the
   lexicon file, the source file and the destination file; any further
   ones are ignored.  -Wnnn (or -wnnn) turns word wrap on when nnn lies
   between MINWIDTH and MAXWIDTH; other options are ignored.  Without a
   lexicon argument the usage text is printed.

   The arguments are used in place rather than copied into fixed-size
   buffers, so an argument of any length is safe.  Returns 0. */
int main(int argc, char *argv[])   /* main drag */
{
  int i, width;
  long tmpwidth;
  char *numend;
  char *lex, *src, *dst;
  char noname[1];
  BOOLEAN got_lex, got_src, got_dst, wrapit;

  noname[0] = '\0';     /* stands in for file names that were not given */
  lex = noname;
  src = noname;
  dst = noname;
  got_lex = FALSE;
  got_src = FALSE;
  got_dst = FALSE;
  wrapit = FALSE;
  width = 0;            /* only meaningful when wrapit is TRUE */
  for (i = 1; i < argc; i++)
  {
    if (argv[i][0] == '-')
    {
      if ((argv[i][1] == 'W') || (argv[i][1] == 'w'))
      {

        /* Accept the width only if a number was actually present and
           it is within range. */

        tmpwidth = strtol(argv[i] + 2, &numend, 10);
        if ((numend != argv[i] + 2) &&
            (tmpwidth >= MINWIDTH) && (tmpwidth <= MAXWIDTH))
        {
          wrapit = TRUE;
          width = (int) tmpwidth;
        }
      }
    }
    else if (!got_lex)
    {
      got_lex = TRUE;
      lex = argv[i];
    }
    else if (!got_src)
    {
      got_src = TRUE;
      src = argv[i];
    }
    else if (!got_dst)
    {
      got_dst = TRUE;
      dst = argv[i];
    }
  }
  if (!got_lex)
  {
    printf("XLATE -- by Pope Lou Duchez ... rev. %s\n\n", LAST_REVISED);
    printf("SYNTAX: XLATE lexicon source [dest] [-Wnnn]\n\n");
    printf("  where lexicon = name of file with words to translate\n");
    printf("        source = file whose text you want to translate\n");
    printf("        dest = optional destination output file\n");
    printf("        -W = word wrap: nnn = how many chars to wrap at\n\n");
    printf("Syntax of lexicon file ... Each line follows this format:\n\n");
    printf("   \"english_word\",\"foreign_equivalent\"\n\n");
    printf("Leading/ending spaces on the English text indicate that it should be translated\n");
    printf("only at the start/end of a word.  \"man\" would be translated wherever it might\n");
    printf("appear in a word (e.g., \"humanity\"); \" man\" would be translated only if it\n");
    printf("started a word (\"mankind\"); \"man \" translates only if it ends a word\n");
    printf("(\"Batman\").  \" man \" would indicate the exact word \"man\".\n\n");
    printf("Note that spaces and punctuation are allowed within both the English and\n");
    printf("foreign texts (2 quotes = embedded quote).\n");
  }
  else
  {
    read_lexicon(lex);
    xlate_file(got_src, src, got_dst, dst, wrapit, width);
  }
  return 0;
}
