```#define VECTREX 1

#include "muldiv.h"

// vectrex random:
unsigned int16_t _x=12, _a=34, _b=56, _c=78;

unsigned int16_t vrandom(void) {
_x++; _a = (_a^_c^_x); _b = (_b+_a);
return _c = ((_c+(_b>>1))^_a);
}

unsigned int32_t random32(void) {
return (((unsigned int32_t)vrandom() << 24) ^ ((unsigned int32_t)vrandom() << 16) ^ ((unsigned int32_t)vrandom() << 8) ^ (unsigned int32_t)vrandom());
}

// =====================================================================================
//                                    DIVIDING
// =====================================================================================

/* function to divide arbitrary unsigned 8 bit numerator by unsigned 8 bit divisor */
unsigned int8_t udiv8_by_8(unsigned int8_t num, unsigned int8_t denom) {
unsigned int16_t temp;
unsigned int8_t rem;
static const void *lab = {
&&d0,     &&d1,     &&d2,     &&d3,     &&d4,     &&d5,     &&d6,     &&d7,
&&d8,     &&d9,     &&d10,    &&d11,    &&d12,    &&d13,    &&d14,    &&d15,
&&d16,    &&d17,    &&d18,    &&d19,    &&d20,    &&d21,    &&d22,    &&d23,
&&d24,    &&d25,    &&d26,    &&d27,    &&d28,    &&d29,    &&d30,    &&d31,
&&d32,    &&d33,    &&d34,    &&d35,    &&d36,    &&d37,    &&d38,    &&d39,
&&d40,    &&d41,    &&d42,    &&d43,    &&d44,    &&d45,    &&d46,    &&d47,
&&d48,    &&d49,    &&d50,    &&d51,    &&d52,    &&d53,    &&d54,    &&d55,
&&d56,    &&d57,    &&d58,    &&d59,    &&d60,    &&d61,    &&d62,    &&d63,
&&d64,    &&d65,    &&d66,    &&d67,    &&d68,    &&d69,    &&d70,    &&d71,
&&d72,    &&d73,    &&d74,    &&d75,    &&d76,    &&d77,    &&d78,    &&d79,
&&d80,    &&d81,    &&d82,    &&d83,    &&d84,    &&d85,    &&dd,     &&dd,
&&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,
&&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,
&&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,
&&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,
&&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,     &&dd,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
&&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d, &&d,
};
goto *lab[denom];
d: return num >= denom ? 1U : 0U;

dd: return num < denom ? 0U : (num < (denom<<1) ? 1U : 2U);

d0: return 255U; // User needs to supply some sort of error handler.
d1: return num;
d2:  return div2(num);   d3:  return div3(num);   d4:  return div4(num);   d5:  return div5(num);
d6:  return div6(num);   d7:  return div7(num);   d8:  return div8(num);   d9:  return div9(num);
d10: return div10(num);  d11: return div11(num);  d12: return div12(num);  d13: return div13(num);
d14: return div14(num);  d15: return div15(num);  d16: return div16(num);  d17: return div17(num);
d18: return div18(num);  d19: return div19(num);  d20: return div20(num);  d21: return div21(num);
d22: return div22(num);  d23: return div23(num);  d24: return div24(num);  d25: return div25(num);
d26: return div26(num);  d27: return div27(num);  d28: return div28(num);  d29: return div29(num);
d30: return div30(num);  d31: return div31(num);  d32: return div32(num);  d33: return div33(num);
d34: return div34(num);  d35: return div35(num);  d36: return div36(num);  d37: return div37(num);
d38: return div38(num);  d39: return div39(num);  d40: return div40(num);  d41: return div41(num);
d42: return div42(num);  d43: return div43(num);  d44: return div44(num);  d45: return div45(num);
d46: return div46(num);  d47: return div47(num);  d48: return div48(num);  d49: return div49(num);
d50: return div50(num);  d51: return div51(num);  d52: return div52(num);  d53: return div53(num);
d54: return div54(num);  d55: return div55(num);  d56: return div56(num);  d57: return div57(num);
d58: return div58(num);  d59: return div59(num);  d60: return div60(num);  d61: return div61(num);
d62: return div62(num);  d63: return div63(num);  d64: return div64(num);  d65: return div65(num);
d66: return div66(num);  d67: return div67(num);  d68: return div68(num);  d69: return div69(num);
d70: return div70(num);  d71: return div71(num);  d72: return div72(num);  d73: return div73(num);
d74: return div74(num);  d75: return div75(num);  d76: return div76(num);  d77: return div77(num);
d78: return div78(num);  d79: return div79(num);  d80: return div80(num);  d81: return div81(num);
d82: return div82(num);  d83: return div83(num);  d84: return div84(num);  d85: return div85(num);
}

// this division code started off as a simple long division shift-and-subtract taken
// from the "C" section of: https://gtoal.com/vectrex/vectrex-muldiv.c.html
// Speeded up by using a binary chop to detect where to subtract from.

unsigned int32_t div_core32(unsigned int32_t numerator, unsigned int32_t denominator, int8_t max_shift) {
unsigned int32_t quotient, shifted;
int8_t shift, min_shift;
#define DIV_BODY                                                                                    \
quotient = 0LL;                                                                                   \
for (;;) {                                                                                        \
if (numerator < denominator) break; /* < 1.0 (ie 0) */                                          \
min_shift = 0;                                                                                  \
for (;;) {                                                                                      \
shift = (min_shift+max_shift)/2;                                                              \
shifted = denominator<<(int32_t)shift;                                                        \
/* The above and below is why the data type used *must* match the 'max_shift' given width!  */\
if ((shifted>>(int32_t)shift) != denominator) {          /* shift causes overflow? */         \
max_shift = shift; /* (exclusive upper bound, remember?) */                                 \
} else {                                                                                      \
if (numerator < shifted) max_shift = shift; else min_shift = shift;                         \
if (min_shift+1==max_shift) break;                                                          \
}                                                                                             \
}                                                                                               \
shift = max_shift = min_shift; /* max shift is remembered for next upper bound */               \
if (numerator < (denominator<<shift)) {                                                         \
quotient += 1UL<<(int16_t)shift; /* remainder is screwed up but we don't care about that for now */ \
break; /* would go negative? */                                                               \
}                                                                                               \
numerator -= denominator<<shift; /* can't go negative */                                        \
quotient += 1UL<<(int16_t)shift;                                                                \
}                                                                                                 \
return quotient;    /* remainder = -numerator; */
DIV_BODY
}

// Not efficient to use 32 bit variables for 16-bit calculations on the 8-bit Vectrex... so:
unsigned int16_t div_core16(unsigned int16_t numerator, unsigned int16_t denominator, int8_t max_shift) {
unsigned int16_t quotient, shifted;
int8_t shift, min_shift;
DIV_BODY // C++ users, consider this a poor man's template.
}

/* In header as static inlines:
unsigned int32_t udiv32_by_32(unsigned int32_t numerator, unsigned int32_t denominator);
unsigned int32_t udiv32_by_16(unsigned int32_t numerator, unsigned int16_t denominator);
unsigned int32_t udiv32_by_8(unsigned int32_t numerator, unsigned int denominator);
unsigned int16_t udiv16_by_16(unsigned int16_t numerator, unsigned int16_t denominator);
unsigned int16_t udiv16_by_8(unsigned int16_t numerator, unsigned int denominator);
unsigned int udiv8_by_8(unsigned int numerator, unsigned int denominator);
*/

int32_t sdiv32_by_32(int32_t numerator, int32_t denominator) {
int sign = 1;
if (denominator == 0) return(int32_t)( numerator < 0 ? 0x80000000LL /*MIN_SIGNED(int32_t, 32)*/ : 0x7FFFFFFFLL /*MAX_SIGNED(int32_t, 32)*/);     // remainder = 0;
if ((numerator == (int32_t)0x80000000LL /*MIN_SIGNED(int32_t, 32)*/) && (denominator == -1)) return 0x7FFFFFFFLL /*MAX_SIGNED(int32_t, 32)*/;  // remainder = 0;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int32_t)div_core32((unsigned int32_t)numerator,(unsigned int32_t)denominator,32) * (int32_t)sign;
}

int32_t sdiv32_by_16(int32_t numerator, int16_t denominator) {
int sign = 1;
if (denominator == 0) return (int32_t)( numerator < 0 ? 0x80000000LL /*MIN_SIGNED(int32_t, 32)*/
: 0x7FFFFFFFLL /*MAX_SIGNED(int32_t, 32)*/);     // remainder = 0;
if ((numerator == (int32_t)0x80000000LL /*MIN_SIGNED(int32_t, 32)*/) && (denominator == -1)) return 0x7FFFFFFFLL /*MAX_SIGNED(int32_t, 32)*/;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int32_t)div_core32((unsigned int32_t)numerator,(unsigned int32_t)denominator,32) * (int32_t)sign;
}

int32_t sdiv32_by_8(int32_t numerator, int8_t denominator) {
int sign = 1;
if (denominator == 0) return (int32_t)(numerator < 0 ? 0x80000000LL /*MIN_SIGNED(int32_t, 32)*/ : 0x7FFFFFFFLL /*MAX_SIGNED(int32_t, 32)*/);     // remainder = 0;
if ((numerator == (int32_t)0x80000000LL /*MIN_SIGNED(int32_t, 32)*/) && (denominator == -1)) return 0x7FFFFFFFLL/*MAX_SIGNED(int32_t, 32)*/;  // remainder = 0;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int32_t)div_core32((unsigned int32_t)numerator,(unsigned int32_t)denominator,32) * (int32_t)sign;
}

int16_t sdiv16_by_16(int16_t numerator, int16_t denominator) {
int sign = 1;
if (denominator == 0) return (int16_t)(numerator < 0 ? 0x8000 /*MIN_SIGNED(int16_t, 16)*/ : 0x7FFF /*MAX_SIGNED(int16_t, 16)*/);     // remainder = 0;
if ((numerator == (int16_t)0x8000 /*MIN_SIGNED(int16_t, 16)*/) && (denominator == -1)) return 0x7FFF /*MAX_SIGNED(int16_t, 16)*/;  // remainder = 0;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int16_t)div_core16((unsigned int16_t)numerator,(unsigned int16_t)denominator,16) * (int16_t)sign;
}

int16_t sdiv16_by_8(int16_t numerator, int8_t denominator) {
int sign = 1;
if (denominator == 0) return (int16_t)(numerator < 0 ? 0x8000 /*MIN_SIGNED(int16_t, 16)*/ : 0x7FFF /*MAX_SIGNED(int16_t, 16)*/);     // remainder = 0;
if ((numerator == (int16_t)0x8000 /*MIN_SIGNED(int16_t, 16)*/) && (denominator == -1)) return 0x7FFF /*MAX_SIGNED(int16_t, 16)*/;  // remainder = 0;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int16_t)div_core16((unsigned int16_t)numerator,(unsigned int16_t)denominator,16) * (int16_t)sign;
}

int8_t sdiv8_by_8(int8_t numerator, int8_t denominator) {
int sign = 1;
if (denominator == 0) return (int8_t)(numerator < 0 ? 0x80 /*MIN_SIGNED(int, 8)*/ : 0x7F /*MAX_SIGNED(int, 8)*/);     // remainder = 0;
if ((numerator == (int8_t)0x80 /*MIN_SIGNED(int, 8)*/) && (denominator == -1)) return 0x7F /*MAX_SIGNED(int, 8)*/;  // remainder = 0;
if (numerator < 0) { sign = -1; numerator = -numerator;}
if (denominator < 0) { sign = -sign; denominator = -denominator;}
return (int8_t)udiv8_by_8((unsigned int8_t)numerator,(unsigned int8_t)denominator)*sign;
}

#ifdef DBMAIN

#ifdef VECTREX
void crash(int reason) {
for (;;) {
}
}
#define TESTS 1000L
#else
#include <time.h>
#define TESTS 10000000L
#endif

int main(int argc, char **argv) {
#ifndef VECTREX
clock_t t;
double base_time, time_taken, original_time, my_time;
#endif

int32_t i;
int32_t dividend, divisor, result;
int32_t cksum;

// default range is everything.

for (i=0; i < TESTS; i++) {
// should force more randomicity in the number of bits used. Half will be 31 bits long. Decreasingly few shorter ones.
dividend = random32()&MAX_SIGNED(int32_t, 32);
do {divisor = random32()&MAX_SIGNED(int32_t, 32); } while (divisor==0);
result = sdiv32_by_32(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_32(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(1);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_32: %ld randomised signed 32-bit by 32-bit tests passed!\n", TESTS);
#endif

for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned int32_t, 32);
do { divisor = random32()&MAX_UNSIGNED(unsigned int32_t, 32); } while (divisor==0);
result = udiv32_by_32(dividend, divisor);
if ((unsigned int32_t)dividend/(unsigned int32_t)divisor != (unsigned int32_t)result) {
#ifndef VECTREX
fprintf(stdout, "udiv32_by_32(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", (unsigned int32_t)dividend, (unsigned int32_t)divisor, (unsigned int32_t)result, (unsigned int32_t)dividend, (unsigned int32_t)divisor, (unsigned int32_t)dividend/(unsigned int32_t)divisor);
exit(1);
#else
crash(2);
#endif
}
}

#ifndef VECTREX
fprintf(stdout, "udiv32_by_32: %ld randomised unsigned 32-bit by 32-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_SIGNED(int32_t, 32);
do { divisor = random32()&MAX_SIGNED(int32_t, 16); } while (divisor==0);
result = sdiv32_by_16(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_16(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(3);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_16: %ld randomised signed 32-bit by 16-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned int32_t, 32);
do { divisor = random32()&MAX_UNSIGNED(unsigned int16_t, 16); } while (divisor==0);
result = udiv32_by_16(dividend, divisor);
if ((unsigned int32_t)dividend/(unsigned int16_t)divisor != (unsigned int32_t)result) {
#ifndef VECTREX
fprintf(stdout, "udiv32_by_16(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(4);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "udiv32_by_16: %ld randomised unsigned 32-bit by 8-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_SIGNED(int32_t, 32);
do { divisor = random32()&MAX_SIGNED(int8_t, 8); } while (divisor==0);
result = sdiv32_by_8(dividend, divisor);
if ((int32_t)dividend/(int8_t)divisor != (int32_t)result) {
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_16(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(5);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv32_by_8: %ld randomised signed 32-bit by 8-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned int32_t, 32);
do { divisor = random32()&MAX_UNSIGNED(unsigned int8_t, 8); } while (divisor==0);
result = udiv32_by_8(dividend, divisor);
if ((unsigned int32_t)dividend/(unsigned int8_t)divisor != (unsigned int32_t)result) {
#ifndef VECTREX
fprintf(stdout, "udiv32_by_16(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(6);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "udiv32_by_8: %ld randomised unsigned 32-bit by 16-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_SIGNED(int32_t, 16);
do { divisor = random32()&MAX_SIGNED(int32_t, 16); } while (divisor==0);
result = sdiv16_by_16(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "sdiv16_by_16(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(7);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv16_by_16: %ld randomised signed 16-bit by 16-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned int32_t, 16);
do { divisor = random32()&MAX_UNSIGNED(unsigned int32_t, 16); } while (divisor==0);
result = udiv16_by_16(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "udiv16_by_16(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(8);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "udiv16_by_16: %ld randomised unsigned 16-bit by 16-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_SIGNED(int32_t, 16);
do { divisor = random32()&MAX_SIGNED(int32_t, 8); } while (divisor==0);
result = sdiv16_by_8(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "sdiv16_by_8(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(9);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv16_by_8: %ld randomised signed 16-bit by 8-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned int32_t, 16);
do { divisor = random32()&MAX_UNSIGNED(unsigned int32_t, 8); } while (divisor==0);
result = udiv16_by_8(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "udiv16_by_8(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(10);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "udiv16_by_8: %ld randomised unsigned 16-bit by 8-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_SIGNED(int32_t, 8);
do { divisor = random32()&MAX_SIGNED(int32_t, 8); } while (divisor==0);
result = sdiv8_by_8(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "sdiv8_by_8(%ld,%ld) = %ld but %ld/%ld = %ld ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(11);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "sdiv8_by_8: %ld randomised  signed 8-bit by 8-bit tests passed!\n", TESTS);
#endif
for (i=0; i < TESTS; i++) {
dividend = random32()&MAX_UNSIGNED(unsigned char, 8);
do { divisor = random32()&MAX_UNSIGNED(unsigned char, 8); } while (divisor==0);
result = udiv8_by_8(dividend, divisor);
if (dividend/divisor != result) {
#ifndef VECTREX
fprintf(stdout, "udiv8_by_8(%lu,%lu) = %lu but %lu/%lu = %lu ?\n", dividend, divisor, result, dividend, divisor, dividend/divisor);
exit(1);
#else
crash(12);
#endif
}
}
#ifndef VECTREX
fprintf(stdout, "udiv8_by_8: %ld randomised unsigned 8-bit by 8-bit tests passed!\n", TESTS);
#endif
unsigned char num, den;
num = 0;
for (;;) {
den = 1;
for (;;) {
if (num/den != udiv8_by_8(num,den)) {
#ifndef VECTREX
fprintf(stdout, "udiv8_by_8(%d,%d) = %d but %d/%d = %d ?\n", num, den, udiv8_by_8(num,den), num, den, num/den);
exit(1);
#else
crash(13);
#endif
}
den += 1;
if (den == 0) break;
}
num += 1;
if (num == 0) break;
}
#ifndef VECTREX
fprintf(stdout, "udiv8_by_8: Complete 256x256 unsigned 8-bit tests passed!\n");
#endif

return 0;
}
#endif

#define umul8(x,y) ((x)*(y))

// =====================================================================================
//                                 MULTIPLYING
// =====================================================================================

// This first implementation is based on:
//     https://stackoverflow.com/questions/22845801/32-bit-signed-integer-multiplication-without-using-64-bit-data-type/22847373#22847373
// with some minimal tweaking to supply two different word sizes...

/* compute the full 32-bit product of two signed 16-bit integers */
unsigned int32_t umultiply16 (unsigned int16_t a, unsigned int16_t b) {
/* split operands into halves */
unsigned int8_t al = (unsigned int8_t)a;
unsigned int8_t ah = (unsigned int8_t)(a >> 8);
unsigned int8_t bl = (unsigned int8_t)b;
unsigned int8_t bh = (unsigned int8_t)(b >> 8);
unsigned int16_t p0 = umul8(al, bl);   /* compute partial products using native 8x8 -> 16 MUL */
unsigned int16_t p1 = umul8(al, bh);
unsigned int16_t p2 = umul8(ah, bl);
unsigned int16_t p3 = umul8(ah, bh);
unsigned int16_t cy = ((p0 >> 8) + (unsigned int8_t)p1 + (unsigned int8_t)p2) >> 8;
/* sum partial products for high */
unsigned int16_t umul16hi = p3 + (p2 >> 8) + (p1 >> 8) + cy; /* compute the upper 16 bits of the product of two unsigned 16-bit integers */
//unsigned int16_t umul16lo = p0 + (p2 << 8) + (p1 << 8);
unsigned int16_t umul16lo = ((p2 + p1) << 8) + p0;
return (unsigned int32_t)(((unsigned int32_t)umul16hi<<(unsigned int32_t)16) | (unsigned int32_t)umul16lo);  /* bits <31:16> of the product a * b */
}

int32_t multiply16 (int16_t a, int16_t b) {
/* split operands into halves */
unsigned int8_t al = (unsigned int8_t)a;
unsigned int8_t ah = (unsigned int8_t)(a >> 8);
unsigned int8_t bl = (unsigned int8_t)b;
unsigned int8_t bh = (unsigned int8_t)(b >> 8);
unsigned int16_t p0 = umul8(al, bl);   /* compute partial products using native 8x8 -> 16 MUL */
unsigned int16_t p1 = umul8(al, bh);
unsigned int16_t p2 = umul8(ah, bl);
unsigned int16_t p3 = umul8(ah, bh);
unsigned int16_t cy = ((p0 >> 8) + (unsigned int8_t)p1 + (unsigned int8_t)p2) >> 8;
/* sum partial products for high */
unsigned int16_t umul16hi = (unsigned int16_t)(p3 + (p2 >> 8) + (p1 >> 8) + cy); /* compute the upper 16 bits of the product of two unsigned 16-bit integers */
int16_t mul16hi = (int16_t)(umul16hi - (unsigned int16_t)(((a < 0) ? b : 0) - ((b < 0) ? a : 0))); /* compute the upper 16 bits of the product of two signed 16-bit integers */
/* sum partial products for low */
//unsigned int16_t umul16lo = p0 + (p2 << 8) + (p1 << 8);
unsigned int16_t umul16lo = ((p2 + p1) << 8) + p0;
return (int32_t)(((int32_t)mul16hi<<(int32_t)16) | (int32_t)((unsigned int32_t)umul16lo));  /* bits <31:16> of the product a * b */
}

// -----------

#ifndef VECTREX
/* compute the full 64-bit product of two signed 32-bit integers */
int64_t multiply32(int32_t a, int32_t b) {
/* split operands into halves */
unsigned int16_t al = (unsigned int16_t)a;
unsigned int16_t ah = a >> 16;
unsigned int16_t bl = (unsigned int16_t)b;
unsigned int16_t bh = b >> 16;
/* compute partial products */
unsigned int32_t p0 = umultiply16(al, bl);
unsigned int32_t p1 = umultiply16(al, bh);
unsigned int32_t p2 = umultiply16(ah, bl);
unsigned int32_t p3 = umultiply16(ah, bh);
/* sum partial products */
unsigned int32_t cy = ((p0 >> 16) + (unsigned int16_t)p1 + (unsigned int16_t)p2) >> 16;
unsigned int32_t umul32hi = p3 + (p2 >> 16) + (p1 >> 16) + cy; /* compute the upper 32 bits of the product of two unsigned 32-bit integers */
int32_t mul32hi = umul32hi - ((a < 0) ? b : 0) - ((b < 0) ? a : 0); /* compute the upper 32 bits of the product of two signed 32-bit integers */
/* sum partial products for low */
//unsigned int32_t umul32lo = p0 + (p2 << 16) + (p1 << 16);
unsigned int32_t umul32lo = ((p2 + p1) << 16) + p0;
return (int64_t)mul32hi<<32LL | (int64_t)((unsigned int64_t)umul32lo); // requires native 32x32 -> lower-32 multiply.  (16x16 does not work)
}
#endif

// =====================================================================================

// The next implementation is based on:
//    https://www.techiedelight.com/multiply-16-bit-integers-using-8-bit-multiplier/
// although it required a lot more hacking to fit our use-case...

unsigned int32_t umultiply16bit(unsigned int16_t m, unsigned int16_t n) {
unsigned int8_t mLow = (unsigned int8_t)m;  	// stores first 8-bits of m
unsigned int8_t mHigh = (unsigned int8_t)(m>>8);	// stores last 8-bits of m

unsigned int8_t nLow = (unsigned int8_t)n;  	// stores first 8-bits of n
unsigned int8_t nHigh = (unsigned int8_t)(n>>8);	// stores last 8-bits of n

unsigned int16_t mLow_nLow   = (unsigned int16_t)umul8(mLow,  nLow); // native 8x8 -> 16 MULs
unsigned int16_t mHigh_nLow  = (unsigned int16_t)umul8(mHigh, nLow);
unsigned int16_t mLow_nHigh  = (unsigned int16_t)umul8(mLow,  nHigh);
unsigned int16_t mHigh_nHigh = (unsigned int16_t)umul8(mHigh, nHigh);

// return mLow_nLow + ((mHigh_nLow + mLow_nHigh) << 8L) + (mHigh_nHigh << 16L);
// -> return (mHigh_nHigh << 16L) + ((mHigh_nLow + mLow_nHigh) << 8L) + mLow_nLow;
return (((mHigh_nHigh << 8L) + (mHigh_nLow + mLow_nHigh)) << 8L) + mLow_nLow;
}

int32_t multiply16bit(int16_t m, int16_t n) {
int sign = 1;

if (m < 0) {m = -m; sign = -sign;}
if (n < 0) {n = -n; sign = -sign;}

if (sign < 0) {
return -(int32_t)umultiply16bit((unsigned int16_t)m,(unsigned int16_t)n);
} else {
return (int32_t)umultiply16bit((unsigned int16_t)m,(unsigned int16_t)n);
}
}

// -----------
#ifndef VECTREX
static unsigned int64_t umultiply32bit(unsigned int32_t m, unsigned int32_t n) {
// could use these 4 as parameters instead of 2 int32_t's:
unsigned int16_t mLow =  m;  		// stores first 8-bits of m
unsigned int16_t mHigh = m >> 16;	// stores last 8-bits of m

unsigned int16_t nLow = n;  		// stores first 8-bits of n
unsigned int16_t nHigh = n >> 16;	// stores last 8-bits of n

unsigned int32_t mLow_nLow = umultiply16bit(mLow, nLow);
unsigned int32_t mHigh_nLow = umultiply16bit(mHigh, nLow);
unsigned int32_t mLow_nHigh = umultiply16bit(mLow, nHigh);
unsigned int32_t mHigh_nHigh = umultiply16bit(mHigh, nHigh);
unsigned int32_t LOW, LOW_LOW, SUM, CARRY;

SUM = mHigh_nLow + mLow_nHigh;
LOW_LOW  = SUM << 16L;
CARRY = 0; LOW = mLow_nLow+LOW_LOW; if (LOW < mLow_nLow || LOW < LOW_LOW) CARRY = 1;

return ((unsigned int64_t) ((SUM >> 16L)+mHigh_nHigh+CARRY) << 32LL) | LOW;
}

static int64_t multiply32bit(int32_t m, int32_t n) { // for Vectrex, will need to pass in parameters as multiple 16-bit ints (Q16.16 format)
int sign = 1;
// will fail if both params are largest negative integer.  I don't care.
if (m < 0) {m = -m; sign = -sign;}
if (n < 0) {n = -n; sign = -sign;}

if (sign < 0) {
return -umultiply32bit(m,n);
} else {
return umultiply32bit(m,n);
}
}
#endif
```