#!/usr/bin/perl

##############################################################################
# adapted from C++ code Copyright (C) 2000 Falk Hueffner
# This version Copyright (C) 2002 Eric House (fixin@peak.org)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
##############################################################################

# inputs: 0. Name of file mapping letters to 0..31 values.  In English
# case just contains A..Z.  This will be used to translate the tries
# on output.
#         1. Max number of bytes per binary output file.
#
#         2. Basename of binary files for output.

#         3. Name of file to which to write the number of the
# startNode, since I'm not rewriting a bunch of code to expect Falk's
# '*' node at the start.
#

#         In STDIN, the text file to be compressed.  It absolutely
# must be sorted.  The sort doesn't have to follow the order in the
# map file, however.

# This is meant eventually to be runnable as part of a cgi system for
# letting users generate Crosswords dicts online.


use strict;
use POSIX;

# Global state shared by the subs below.
my $gFirstDiff;                 # number of leading letters the word just read
                                # shares with the previous one (readNextWord)
my @gCurrentWord;               # word currently being built into the trie
my $gCurrentWord;               # save so can check for sortedness
my $gDone = 0;                  # set once @gInputStrings has been exhausted
my @gInputStrings;              # words returned by parseAndSort (array refs)
my $gNeedsSort = 1;             # cleared by -nosort
my @gNodes;                       # final array of nodes
my $gNBytesPerOutfile = 0xFFFFFFFF;
my $gTableFile;                 # -m: letter map file
my $gOutFileBase;               # -ob: base name for binary output files
my $gStartNodeOut;              # -sn: file receiving the start node offset
my $gInFileName;                # -if: input file; STDIN when unset
my $gKillIfMissing = 1;         # die (1) or drop the word (0) on unmapped chars
# Word terminator.  It is compared against single bytes returned by getc,
# so it must be exactly one character; the previous value '/n' was a
# two-character string that could never match.
my $gTermChar = "\n";
my $gDumpText = 0;              # dump the dict as text after?
my $gCountFile;                 # -wc: file receiving the word count
my $gWordCount = 0;
my %gTableHash;                 # input char -> tile value (makeTableHash)
my @gRevMap;                    # tile value -> input char (makeTableHash)
my $debug = 0;
my %gSubsHash;                  # sibling-run key -> index into @gNodes
my $gForceFour = 0;             # use four bytes regardless of need?

main();

exit;

##############################################################################

# Entry point.  Parses the command line, loads the letter map, reads the
# word list (from -if or STDIN), builds the node array, and writes
# whichever outputs were requested (-sn, -wc, -ob, -dump).
# Dies if the input file or the count file cannot be opened.
sub main() { 

    if ( !parseARGV() ) {
        usage();
        exit();
    }

    makeTableHash();

    my $infile;

    if ( $gInFileName ) {
        # three-arg open so the file name cannot be read as a mode
        open( $infile, '<', $gInFileName )
            or die "cannot open input file $gInFileName: $!\n";
    } else {
        $infile = \*STDIN;
    }

    @gInputStrings = parseAndSort( $infile );
    if ( $gInFileName ) {
        close $infile;
    }

    # Do I need this stupid thing?  Better to move the first row to
    # the front of the array and patch everything else.  Or fix the
    # non-palm dictionary format to include the offset of the first
    # node.

    # Index 0 holds a placeholder so that no real sibling run can start
    # at offset 0 (0 means "no children" in a node's fco field).
    my $dummyNode = 0xFFFFFFFF;
    @gNodes = ( $dummyNode );
    
    readNextWord();

    my $firstRootChildOffset = buildNode(0);

    # Relocates the root run to index 0 and resets the offset to match.
    moveTopToFront( \$firstRootChildOffset );

    if ( $gStartNodeOut ) {
        writeOutStartNode( $gStartNodeOut, $firstRootChildOffset );
    }

    print STDERR "\n... dumping table ...\n" if $debug;
    printNodes( \@gNodes, "done with main" ) if $debug;

    # write out the number of words if requested (32-bit big-endian)
    if ( $gCountFile ) {
        open( my $countOut, '>', $gCountFile )
            or die "cannot open count file $gCountFile: $!\n";
        binmode $countOut;
        print $countOut pack( "N", $gWordCount );
        close( $countOut )
            or die "error closing count file $gCountFile: $!\n";
        print STDERR "wrote out: got $gWordCount words\n";
    }
 
    if ( $gOutFileBase ) {
        emitNodes( $gNBytesPerOutfile, $gOutFileBase );
    }

    if ( $gDumpText && @gNodes > 0 ) {
        printOneLevel( $firstRootChildOffset, "" );
    }
} # main

# We now have an array of nodes with the last subarray being the
# logical top of the tree.  Move them to the start, fixing all fco
# refs, so that legacy code like Palm can assume top==0.
#
# Note: It'd probably be a bit faster to integrate this with emitNodes
# -- unless I need to have an in-memory list that can be used for
# lookups.  But that's best for debugging, so keep it this way for now.
#
# Also Note: the first node is a dummy that can and should be tossed
# now.

# Takes a reference to the index of the root sibling run.  Removes that
# run from the tail of @gNodes, drops the dummy node at index 0, puts the
# run at the front, and bumps every non-zero first-child offset by the
# resulting displacement.  The referenced index is reset to 0.
sub moveTopToFront($) {
    my ( $firstRef ) = @_;

    my $rootStart = ${$firstRef};
    ${$firstRef} = 0;

    my $haveRoot = $rootStart > 0;
    my @rootRun;

    if ( $haveRoot ) {
        # everything from the root's index to the end is the root run
        @rootRun = splice( @gNodes, $rootStart );
    } elsif ( $gWordCount != 0 ) {
        die "there should be no words!!";
    }

    # discard the placeholder that occupied index 0
    shift @gNodes;

    # Every surviving node moves up by the root run's length and down
    # by one for the discarded placeholder.
    my $shiftBy = 0;
    if ( $haveRoot ) {
        $shiftBy = @rootRun - 1;
        die "something wrong with len\n" if $shiftBy < 0;
    }

    unshift @gNodes, @rootRun;

    # Offsets only ever grow: nothing referred to the root run itself.
    foreach my $node ( @gNodes ) {       # $node aliases the array element
        my $fco = TrieNodeGetFirstChildOffset( $node );
        next if $fco == 0;               # 0 means NONE, not 0th!!
        TrieNodeSetFirstChildOffset( \$node, $fco + $shiftBy );
    }
} # moveTopToFront


# Recursively builds the sibling run for position $depth of the current
# word and of every following word that shares its first $depth letters.
# Returns the index in @gNodes where the run lives (from addNodes), or 0
# when there are no children at this depth.
# Relies on readNextWord to advance @gCurrentWord, $gFirstDiff and $gDone.
sub buildNode {
    my ( $depth ) = @_;

    if ( $depth == @gCurrentWord ) {
        # The current word is used up.  Fetch the next one; unless it
        # extends the word we just finished, this branch has no children.
        readNextWord();
        return 0 if $gDone || $gFirstDiff < $depth;
    }

    my @siblings;

    do {
        # Capture both values before recursing: the recursive call may
        # replace @gCurrentWord.
        my $isTerminal = ( $depth == @gCurrentWord - 1 ) ? 1 : 0;
        my $letter = $gCurrentWord[$depth];

        my $childOffset = buildNode( $depth + 1 );
        push @siblings, MakeTrieNode( $letter, $isTerminal, $childOffset );

    } while ( !$gDone && $gFirstDiff == $depth );

    # flag the final node so readers know where the run ends
    TrieNodeSetIsLastSibling( \$siblings[-1], 1 );

    return addNodes( \@siblings );
} # buildNode

# Returns the index in @gNodes of a sibling run identical to the one
# passed in, appending and registering the run first if no identical one
# is already known.  Index 0 is never a legal result.
sub addNodes {
    my ( $newedgesR ) = @_;

    my $existing = findSubArray( $newedgesR );
    if ( $existing >= 0 ) {
        die "0 is an invalid match!!!" if $existing == 0;
        return $existing;
    }

    # not seen before: the run goes at the current end of the array
    my $insertAt = scalar @gNodes;

    if ( $debug ) {
        print STDERR "adding...\n";
        printNodes( $newedgesR );
    }

    push @gNodes, @{$newedgesR};
    registerSubArray( $newedgesR, $insertAt );

    return $insertAt;
} # addNodes

# Writes one node's index and decoded fields to STDERR on a single line.
sub printNode {
    my ( $index, $node ) = @_;

    my @fields = (
        TrieNodeGetLetter($node),
        TrieNodeGetIsTerminal($node),
        TrieNodeGetIsLastSibling($node),
        TrieNodeGetFirstChildOffset($node),
    );

    print STDERR "[$index] ";
    printf( STDERR
            "letter=%d; isTerminal=%d; isLastSib=%d; fco=%d;\n",
            @fields );
} # printNode

# Prints every node in the referenced array via printNode, labelling
# each with its position.  $name is accepted but not used.
sub printNodes {
    my ( $nodesR, $name ) = @_;

    my $count = @{$nodesR};

    foreach my $pos ( 0 .. $count - 1 ) {
        printNode( $pos, ${$nodesR}[$pos] );
    }

}


# Hashing.  We'll keep a hash of offsets into the existing nodes
# array, and as the key use a string that represents the entire sub
# array.  Since the key is what we're matching for, there should never
# be more than one value per key and so we don't need buckets.
# Return -1 if there's no match.

# Builds the %gSubsHash key for a sibling run.  The node values are
# joined with a separator so that runs such as (12, 3) and (1, 23)
# produce different keys; joining with the empty string made them
# indistinguishable.
sub _subArrayKey {
    my ( $edgesR ) = @_;

    return join( ',', @{$edgesR} );
} # _subArrayKey

# Looks up a sibling run (array ref of node values) in %gSubsHash.
# Returns the index recorded for an identical run, or -1 if none.
sub findSubArray {
    my ( $newedgesR ) = @_;

    my $key = _subArrayKey( $newedgesR );

    if ( exists( $gSubsHash{$key} ) ) {
        return $gSubsHash{$key};
    } else {
        return -1;
    }
} # findSubArray

# add to the hash
# Records that the sibling run $edgesR lives at index $nodeLoc.  Dies if
# an identical run has already been registered.
sub registerSubArray {
    my ( $edgesR, $nodeLoc ) = @_;

    my $key = _subArrayKey( $edgesR );

    if ( exists $gSubsHash{$key} ) {
        die "entry for key shouldn't exist!!";
    } else {
        $gSubsHash{$key} = $nodeLoc;
    }

} # registerSubArray

# Advances to the next input word.  Takes the next entry off
# @gInputStrings (setting $gDone when there is none), stores in
# $gFirstDiff how many leading letters it shares with the previous word,
# and makes it the current word.  Dies if the new word does not sort
# after the previous one.
sub readNextWord() {
    my @word;

    if ( !$gDone ) {
        $gDone = @gInputStrings == 0;
        if ( $gDone ) {
            print STDERR "gDone set to true\n" if $debug;
        } else {
            @word = @{ shift @gInputStrings };
        }

        print STDERR "got word: ", join(',',@word), "\n" if $debug;
    }

    # only positions present in both words can match
    my $limit = scalar(@word) < scalar(@gCurrentWord)
        ? scalar(@word)
        : scalar(@gCurrentWord);

    my $shared = 0;
    ++$shared
        while $shared < $limit
            && $gCurrentWord[$shared] eq $word[$shared];

    $gFirstDiff = $shared;

    # the check is skipped when either word is empty
    if ( @gCurrentWord > 0 && @word > 0
         && !firstBeforeSecond( \@gCurrentWord, \@word ) ) {
        die "words ", join(",",@gCurrentWord), " and ", join(",", @word), " out of order";
    }

    @gCurrentWord = @word;
} # readNextWord

# Compares two words given as references to arrays of numeric letter
# values.  Returns true when the first sorts strictly before the second
# (a proper prefix counts as "before"), false otherwise.  Dies with
# "duplicate!!!" when the two words are identical.
sub firstBeforeSecond {
    my ( $firstR, $secondR ) = @_;

    my $pos = 0;
    while ( 1 ) {

        if ( $pos == @{$firstR} ) {
            # first word used up: it is a prefix of (or equal to) the second
            die "duplicate!!!" if $pos == @{$secondR};
            return 1;
        }

        # second word ran out while the first still has letters
        return 0 if $pos == @{$secondR};

        my $order = $firstR->[$pos] <=> $secondR->[$pos];
        return $order < 0 if $order != 0;

        ++$pos;
    }
} # firstBeforeSecond

# passed to sort.  Should remain unprototyped for efficiency's sake

# sort comparator: $a and $b are references to arrays of numeric letter
# values.  Orders by the first differing letter; when one word is a
# prefix of the other, the shorter word sorts first.
sub cmpWords {

    my $lenA = scalar @{$a};
    my $lenB = scalar @{$b};
    my $shared = $lenA < $lenB ? $lenA : $lenB;

    foreach my $pos ( 0 .. $shared - 1 ) {
        my $order = $a->[$pos] <=> $b->[$pos];
        return $order if $order != 0;
    }

    # identical over the shared length: the longer word is greater
    return $lenA <=> $lenB;
} # cmpWords

# Reads $infile one character at a time and splits it into words at
# $gTermChar.  Each character is translated through %gTableHash; a word
# becomes a reference to an array of the translated values.  A character
# missing from the table either kills the program ($gKillIfMissing) or
# causes the word containing it to be dropped.  $gWordCount is
# incremented for every word kept.  Returns the list of words, sorted
# with cmpWords when $gNeedsSort is set.
# Dies if a word has more than 15 letters.
sub parseAndSort() {
    my ( $infile ) = @_;

    my @wordlist;
    my @word;
    my $dropWord = 0;

    # getc returns undef at end of file
    while ( defined( my $byt = getc($infile) ) ) {

        if ( $byt eq $gTermChar ) {
            if ( !$dropWord ) {
                push @wordlist, [ @word ];
                ++$gWordCount;
            }
            # start the next word fresh
            @word = ();
            $dropWord = 0;
        } elsif ( exists( $gTableHash{$byt} ) ) {
            if ( !$dropWord ) {
                push @word, $gTableHash{$byt};
                die "word too long" if @word > 15;
            }
        } elsif ($gKillIfMissing) {
            # report the character code; "$byt+0" was always 0 for letters
            die "char $byt (", ord($byt), ") not in map file $gTableFile\n";
        } else {
            $dropWord = 1;
            @word = ();     # lose anything we already have
        }
    }

    # A final word with no terminator before end of file used to be
    # silently discarded; keep it.
    if ( @word && !$dropWord ) {
        push @wordlist, [ @word ];
        ++$gWordCount;
    }

    if ( $gNeedsSort && ($gWordCount > 0)  ) {
        @wordlist = sort cmpWords @wordlist;
    }

    print STDERR "length of list is ", @wordlist + 0, ".\n" if $debug;

    return @wordlist;
} # parseAndSort

# Print binary representation of trie array.  This isn't used yet, but
# eventually it'll want to dump to multiple files appropriate for Palm
# that can be catenated together on other platforms.  There'll need to
# be a file giving the offset of the first node too.  Also, might want
# to move to 4-byte representation when the input can't otherwise be
# handled.

# Writes every entry of @gNodes to STDOUT, each packed with the "I"
# template (native unsigned int).
sub dumpNodes {

    foreach my $node ( @gNodes ) {
        print STDOUT pack( "I", $node );
    }
}

##############################################################################
# Little node-field setters and getters to hide what bits represent
# what.
##############################################################################

# Sets or clears the terminal flag (bit 31) of the node that $nodeR
# refers to, depending on the truth of $isTerminal.
sub TrieNodeSetIsTerminal {
    my ( $nodeR, $isTerminal ) = @_;

    my $terminalBit = 1 << 31;

    ${$nodeR} = $isTerminal
        ? ( ${$nodeR} | $terminalBit )
        : ( ${$nodeR} & ~$terminalBit );
}

# True when the terminal flag (bit 31) of $node is set.
sub TrieNodeGetIsTerminal {
    my ( $node ) = @_;
    my $terminalBit = 1 << 31;
    return ( $node & $terminalBit ) != 0;
}

# Sets or clears the last-sibling flag (bit 30) of the node that $nodeR
# refers to, depending on the truth of $isLastSibling.
sub TrieNodeSetIsLastSibling {
    my ( $nodeR, $isLastSibling ) = @_;

    my $lastSiblingBit = 1 << 30;

    ${$nodeR} = $isLastSibling
        ? ( ${$nodeR} | $lastSiblingBit )
        : ( ${$nodeR} & ~$lastSiblingBit );
}

# True when the last-sibling flag (bit 30) of $node is set.
sub TrieNodeGetIsLastSibling {
    my ( $node ) = @_;
    my $lastSiblingBit = 1 << 30;
    return ( $node & $lastSiblingBit ) != 0;
}

# Stores $letter in the 5-bit letter field (bits 25..29) of the node that
# $nodeR refers to, leaving the other bits alone.  Dies if $letter is 32
# or more.
sub TrieNodeSetLetter {
    my ( $nodeR, $letter ) = @_;

    die "letter ", $letter, " too big" if $letter >= 32;

    my $letterField = 0x1F << 25;

    # wipe the old letter, then drop the new one into place
    ${$nodeR} = ( ${$nodeR} & ~$letterField ) | ( $letter << 25 );
}

# Returns the 5-bit letter field (bits 25..29) of $node.
sub TrieNodeGetLetter {
    my ( $node ) = @_;
    return ( $node >> 25 ) & 0x1F;
}

# Stores $fco in the first-child-offset field (low 25 bits) of the node
# that $nodeR refers to, leaving the higher bits alone.  Dies if $fco has
# any of bits 25..31 set.
sub TrieNodeSetFirstChildOffset {
    my ( $nodeR, $fco ) = @_;

    my $offsetField = 0x01FFFFFF;

    die "$fco larger than 25 bits" if ($fco & 0xFE000000) != 0;

    # wipe the old offset, then merge in the new one
    ${$nodeR} = ( ${$nodeR} & ~$offsetField ) | $fco;
}

# Returns the first-child-offset field of $node: its low 25 bits.
sub TrieNodeGetFirstChildOffset {
    my ( $node ) = @_;
    return $node & 0x01FFFFFF;
}


# Assembles a node value from its four fields using the field setters
# and returns it.  $isLastSibling may be omitted, leaving the flag clear.
sub MakeTrieNode {
    my ( $letter, $isTerminal, $firstChildOffset, $isLastSibling ) = @_;

    my $node = 0;

    # flags first, then the two multi-bit fields
    TrieNodeSetIsTerminal( \$node, $isTerminal );
    TrieNodeSetIsLastSibling( \$node, $isLastSibling );
    TrieNodeSetLetter( \$node, $letter );
    TrieNodeSetFirstChildOffset( \$node, $firstChildOffset );

    return $node;
} # MakeTrieNode

# Caller may need to know the offset of the first top-level node.
# Write it here.
# Writes $firstRootChildOffset to the file $startNodeOut as a 32-bit
# big-endian integer (pack "N"), replacing any existing content.
# Dies if the file cannot be opened or closed.
sub writeOutStartNode {
    my ( $startNodeOut, $firstRootChildOffset ) = @_;

    # three-arg open with a lexical handle; failure is no longer ignored
    open( my $nodeOut, '>', $startNodeOut )
        or die "cannot open start node file $startNodeOut: $!\n";
    binmode $nodeOut;           # packed bytes must not be translated
    print $nodeOut pack( "N", $firstRootChildOffset );
    close( $nodeOut )
        or die "error closing start node file $startNodeOut: $!\n";
} # writeOutStartNode

# build the hash for translating.  I'm using a hash assuming it'll be
# fast.  Key is the letter; value is the 0..31 value to be output.
# Reads the map file $gTableFile one character at a time.  Every
# character is appended to @gRevMap; every character except NUL (the
# blank) is also entered in %gTableHash with its position in the file as
# the value.  The blank still uses up a position.
# Dies if the file cannot be opened, or if a non-blank character sits at
# position 32 or beyond.
sub makeTableHash {
    open( my $tableIn, '<', $gTableFile )
        or die "cannot open map file $gTableFile: $!\n";

    @gRevMap = ();              # empty it

    for ( my $i = 0; ; ++$i ) {
        my $ch = getc( $tableIn );

        # getc returns undef at end of file
        last if !defined $ch;

        push @gRevMap, $ch;

        if ( ord($ch) == 0 ) {	# blank
            next;       # we want to increment i when blank seen since 
                        # it is a tile value
        }

        die "$gTableFile too large\n" if $i > 32;
        # Blanks never reach this point, so any character at position 32
        # is an error.  The old "$ch != 0" test compared the character
        # numerically and so never fired for letters.
        die "only blank (0) can be 32nd char\n" if $i == 32;

        $gTableHash{$ch} = $i;
    }

    close $tableIn;
} # makeTableHash

# emitNodes. "input" is $gNodes.  From it we write up to
# $nBytesPerOutfile to files named $outFileBase0..n, mapping the
# letter field down to 5 bits with a hash built from $tableFile.  If
# at any point we encounter a letter not in the hash we fail with an
# error.

# Writes @gNodes to one or more files named <outFileBase>_NNN.bin.  A
# run of siblings (nodes up to and including one flagged last-sibling)
# is never split across files; a new file is started when the next run
# would push the current file past $gNBytesPerOutfile bytes.  Nodes are
# written with outputNode using 3 bytes each, or 4 when there are more
# than 0x1FFFF nodes or $gForceFour is set.
# Dies if a file cannot be opened or closed, if the final node is not
# flagged last-sibling, if a single run cannot fit in one file, or if
# more than 100 files would be needed.
sub emitNodes($$) {
    my ( $gNBytesPerOutfile, $outFileBase ) = @_;

    # is 17 bits enough?
    my $nBytesPerNode = ( @gNodes > 0x1FFFF || $gForceFour ) ? 4 : 3;

    my $nextIndex = 0;

    for ( my $nextFileNum = 0; $nextIndex < @gNodes; ++$nextFileNum ) {

        die "Too many outfiles; infinite loop?" if $nextFileNum > 99;

        # pass the base name as an argument so a '%' in it is harmless
        my $outName = sprintf( "%s_%03d.bin", $outFileBase, $nextFileNum );
        open( my $out, '>', $outName )
            or die "cannot open output file $outName: $!\n";
        binmode $out;
        my $curSize = 0;

        while ( $nextIndex < @gNodes ) {

            # scan to the node that ends this run of siblings
            my $i = $nextIndex;
            while ( !TrieNodeGetIsLastSibling( $gNodes[$i] ) ) {
                ++$i;
                die "bad trie format: last node not last sibling"
                    if $i >= @gNodes;
            }
            ++$i;	# move beyond the last sibling

            my $nextSize = ($i - $nextIndex) * $nBytesPerNode;

            # A run that cannot fit even in an empty file would otherwise
            # produce a series of empty files before giving up.
            die "sibling run of $nextSize bytes exceeds per-file limit "
                . "of $gNBytesPerOutfile bytes\n"
                if $curSize == 0 && $nextSize > $gNBytesPerOutfile;

            last if $curSize + $nextSize > $gNBytesPerOutfile;

            # emit the run
            while ( $nextIndex < $i ) {
                outputNode( $gNodes[$nextIndex], $nBytesPerNode, $out );
                ++$nextIndex;
            }
            $curSize += $nextSize;
        }

        close( $out )
            or die "error closing output file $outName: $!\n";
    }

} # emitNodes

# Writes $str followed by a newline to STDERR.
sub printWord {
    my ( $str ) = @_;

    my $line = "$str\n";
    print STDERR $line;
}

# print out the entire dictionary, as text, to STDERR.

# Walks the sibling run starting at $index in @gNodes, recursing into
# each node's children, and prints every complete word to STDERR via
# printWord.  $str is the prefix accumulated so far.  Letters are
# translated back through @gRevMap; a character below 0x20 is shown as a
# backslash followed by the character '0' + its code (so the blank,
# code 0, appears as "\0").
# Dies if the walk runs past the end of @gNodes.
sub printOneLevel {

    my ( $index, $str ) = @_;

    for ( ; ; ) {

        # a run with no last-sibling flag would otherwise loop forever
        die "bad trie format: index $index is past the end of the nodes\n"
            if $index >= @gNodes;

        my $newStr = $str;
        my $node = $gNodes[$index++];

        my $lindx = $gRevMap[TrieNodeGetLetter($node)];

        if ( ord($lindx) >= 0x20 ) {
            $newStr .= "$lindx";
        } else {
            print STDERR "sub space" if $debug;
            # '0'+$lindx added two strings numerically and always gave
            # chr(0); offset the character codes instead
            $newStr .= "\\" . chr( ord('0') + ord($lindx) );
        }

        if ( TrieNodeGetIsTerminal($node) ) {
            printWord( $newStr );
        } 

        my $fco = TrieNodeGetFirstChildOffset( $node );
        if ( $fco != 0 ) {      # 0 means no children
            printOneLevel( $fco, $newStr );
        }

        if ( TrieNodeGetIsLastSibling($node) ) {
            last;
        }
    }
}

# Writes one node to $outfile as $nBytes bytes: first the low
# ($nBytes - 1) bytes of the first-child offset, most significant byte
# first, then one byte holding the letter and flags.
sub outputNode {
    my ( $node, $nBytes, $outfile ) = @_;

    # format according to dawg.h:
    # typedef struct array_edge {
    #     unsigned char highByte;
    #     unsigned char lowByte;
    #     unsigned char bits;
    # } array_edge;

    # define LETTERMASK 0x1f
    # define ACCEPTINGMASK 0x20
    # define LASTEDGEMASK 0x40
    # define LASTBITMASK 0x80

    my $fco = TrieNodeGetFirstChildOffset($node);

    # offset bytes: two in three-byte mode, three in four-byte mode
    # (the latter is untested)
    my $byteIdx = $nBytes - 2;
    while ( $byteIdx >= 0 ) {
        print $outfile pack( "C", ( $fco >> ( $byteIdx * 8 ) ) & 0xFF );
        --$byteIdx;
    }

    # any offset bit above the bytes just written is recorded in 0x80
    my $overflowMask = 0xFFFFFFFF << ( ( $nBytes - 1 ) * 8 );

    my $bits = 0 | TrieNodeGetLetter($node);
    $bits |= 0x40 if TrieNodeGetIsLastSibling($node);
    $bits |= 0x20 if TrieNodeGetIsTerminal($node);
    $bits |= 0x80 if ( $fco & $overflowMask ) != 0;

    print $outfile pack( "C", $bits );

} # outputNode

# Prints the command-line summary to STDERR.
# NOTE(review): no default is stated for -term; confirm it against the
# initial value of $gTermChar before documenting one here.
sub usage {
    print STDERR "usage: $0 \n"
        . "\t[-b    bytesPerFile] (default = 0xFFFFFFFF)\n"
        . "\t-m     mapFile\n"
        . "\t-ob    outFileBase\n"
        . "\t-sn    start node out file\n"
        . "\t[-if   input file name]  -- default = stdin\n"
        . "\t[-term code] (numeric character code of the word terminator)\n"
        . "\t[-wc   file] (write the word count to file)\n"
        . "\t[-nosort] (input already sorted in accord with -m; "
        . "default = sort)\n"
        . "\t[-dump]  (write dictionary as text to STDERR for testing)\n"
        . "\t[-force4](use 4 bytes per node regardless of need)\n"
        . "\t[-r]     (reject words with letters not in mapfile)\n"
        . "\t[-k]     (kill if any letters not in mapfile -- default)\n"
        ;

} # usage

# Consumes @ARGV and stores the options in the file-level settings.
# Option names must match exactly.  Returns $gTableFile, which is false
# when no -m option was given (the caller treats that as a usage error).
# Dies on an unknown argument or on an option that lacks its value.
sub parseARGV {

    # options that consume the following argument
    my %valueOption = (
        '-b'    => sub {
            my ( $val ) = @_;
            # accept the 0x... form that the usage text shows
            $gNBytesPerOutfile
                = ( $val =~ /\A0[xX][0-9a-fA-F]+\z/ ) ? hex($val) : $val;
        },
        '-m'    => sub { $gTableFile    = $_[0]; },
        '-ob'   => sub { $gOutFileBase  = $_[0]; },
        '-sn'   => sub { $gStartNodeOut = $_[0]; },
        '-if'   => sub { $gInFileName   = $_[0]; },
        '-term' => sub { $gTermChar     = chr( $_[0] ); },
        '-wc'   => sub { $gCountFile    = $_[0]; },
    );

    # options that stand alone
    my %flagOption = (
        '-r'      => sub { $gKillIfMissing = 0; },
        '-k'      => sub { $gKillIfMissing = 1; },
        '-dump'   => sub { $gDumpText      = 1; },
        '-nosort' => sub { $gNeedsSort     = 0; },
        '-force4' => sub { $gForceFour     = 1; },
    );

    # test the array, not the value, so an argument of "0" cannot end
    # parsing early
    while ( @ARGV ) {
        my $arg = shift(@ARGV);

        if ( exists $valueOption{$arg} ) {
            die "missing value for $arg\n" if !@ARGV;
            $valueOption{$arg}->( shift(@ARGV) );
        } elsif ( exists $flagOption{$arg} ) {
            $flagOption{$arg}->();
        } else {
            die "unexpected arg $arg\n";
        }
    }


    print STDERR "gNBytesPerOutfile=$gNBytesPerOutfile\n" if $debug;
    print STDERR "gTableFile=$gTableFile\n" if $debug;
    print STDERR "gOutFileBase=$gOutFileBase\n" if $debug;
    print STDERR "gStartNodeOut=$gStartNodeOut\n" if $debug;
    printf STDERR "gTermChar=%s(%d)\n", $gTermChar, ord($gTermChar) if $debug;

    return $gTableFile;
    
} # parseARGV