#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define NO_HARDWARE_MUL 1
#ifdef NO_HARDWARE_MUL
// Multiply for 8/16 bit processor with no MUL instruction, eg 6502...
// Not needed on 6809
// generate table of squares or quarter-squares with iteration!
// a live program would put this table into a const array in ROM
uint16_t square[256]; // = { 0,1,4,9,16,25, ..., 65025};
uint16_t squareqtr[256*2]; // = { 0,1,4,9,16,25, ..., 65025};
void init_sq(const int MAX) {
uint32_t step, i, prev, tot;
step = i = prev = tot = 0;
do { // derived from table of differences.
tot += step;
if (i < 256) square[i] = tot+prev; // i^2
squareqtr[i] = (tot+prev)>>2U; // (i^2)/4
//fprintf(stdout, "sq4[%d]=%d\n",i,squareqtr[i]);
prev = tot;
i += 1; step += 1;
} while (i < MAX);
}
static uint16_t umul8add(uint8_t X, uint8_t Y) {
uint8_t pow_of_2;
uint16_t res;
if (X > Y) {
uint8_t swap = X; X = Y; Y = swap;
} else if (X == Y) return square[X];
if (X == 0) return 0;
pow_of_2 = 1; res = Y;
for (;;) {
if (X == pow_of_2) return res;
res = res<<1;
pow_of_2 = pow_of_2<<1; // 0 comes after 128 because uint8_t
if (pow_of_2 == 0) break;
}
//if (X == 1) return Y;
//if (X == 2) return Y<<1;
//if (X == 4) return Y<<2;
//if (X == 8) return Y<<3;
//if (X == 16) return Y<<4;
//if (X == 32) return Y<<5;
//if (X == 64) return Y<<6;
//if (X == 128) return Y<<7;
// derived from formula: X * Y == X * (X + (Y-X)) == X^2 + X * (Y-X)
return umul8add(X, Y-X) + square[X]; // not efficient for small X, big Y.
}
static inline uint16_t umul8qtr(uint8_t X, uint8_t Y) {
return squareqtr[X+Y] - (X < Y ? squareqtr[Y-X] : squareqtr[X-Y]); // very efficient
}
//#define umul8(x,y) umul8add(x,y)
#define umul8(x,y) umul8qtr(x,y)
#else
#define umul8(x,y) ((x)*(y))
#endif
// =====================================================================================
// This first implementation is based on:
// https://stackoverflow.com/questions/22845801/32-bit-signed-integer-multiplication-without-using-64-bit-data-type/22847373#22847373
// with some minimal tweaking to supply two different word sizes...
/* compute the full 32-bit product of two signed 16-bit integers */
uint32_t umultiply16 (uint16_t a, uint16_t b) {
/* split operands into halves */
uint8_t al = (uint8_t)a;
uint8_t ah = a >> 8;
uint8_t bl = (uint8_t)b;
uint8_t bh = b >> 8;
uint16_t p0 = umul8(al, bl); /* compute partial products using native 8x8 -> 16 MUL */
uint16_t p1 = umul8(al, bh);
uint16_t p2 = umul8(ah, bl);
uint16_t p3 = umul8(ah, bh);
uint16_t cy = ((p0 >> 8) + (uint8_t)p1 + (uint8_t)p2) >> 8;
/* sum partial products for high */
uint16_t umul16hi = p3 + (p2 >> 8) + (p1 >> 8) + cy; /* compute the upper 16 bits of the product of two unsigned 16-bit integers */
//uint16_t umul16lo = p0 + (p2 << 8) + (p1 << 8);
uint16_t umul16lo = ((p2 + p1) << 8) + p0;
return (uint32_t)(((uint32_t)umul16hi<<(uint32_t)16) | (uint32_t)umul16lo); /* bits <31:16> of the product a * b */
}
int32_t multiply16 (int16_t a, int16_t b) {
/* split operands into halves */
uint8_t al = (uint8_t)a;
uint8_t ah = a >> 8;
uint8_t bl = (uint8_t)b;
uint8_t bh = b >> 8;
uint16_t p0 = umul8(al, bl); /* compute partial products using native 8x8 -> 16 MUL */
uint16_t p1 = umul8(al, bh);
uint16_t p2 = umul8(ah, bl);
uint16_t p3 = umul8(ah, bh);
uint16_t cy = ((p0 >> 8) + (uint8_t)p1 + (uint8_t)p2) >> 8;
/* sum partial products for high */
uint16_t umul16hi = p3 + (p2 >> 8) + (p1 >> 8) + cy; /* compute the upper 16 bits of the product of two unsigned 16-bit integers */
int16_t mul16hi = umul16hi - ((a < 0) ? b : 0) - ((b < 0) ? a : 0); /* compute the upper 16 bits of the product of two signed 16-bit integers */
/* sum partial products for low */
//uint16_t umul16lo = p0 + (p2 << 8) + (p1 << 8);
uint16_t umul16lo = ((p2 + p1) << 8) + p0;
return (int32_t)(((int32_t)mul16hi<<(int32_t)16) | (int32_t)((uint32_t)umul16lo)); /* bits <31:16> of the product a * b */
}
// -----------
/* compute the full 64-bit product of two signed 32-bit integers */
int64_t multiply32(int32_t a, int32_t b) {
/* split operands into halves */
uint16_t al = (uint16_t)a;
uint16_t ah = a >> 16;
uint16_t bl = (uint16_t)b;
uint16_t bh = b >> 16;
/* compute partial products */
uint32_t p0 = umultiply16(al, bl); // Could use umultiply16 here, if we hadn't already merged it with signed mult...
uint32_t p1 = umultiply16(al, bh);
uint32_t p2 = umultiply16(ah, bl);
uint32_t p3 = umultiply16(ah, bh);
/* sum partial products */
uint32_t cy = ((p0 >> 16) + (uint16_t)p1 + (uint16_t)p2) >> 16;
uint32_t umul32hi = p3 + (p2 >> 16) + (p1 >> 16) + cy; /* compute the upper 32 bits of the product of two unsigned 32-bit integers */
int32_t mul32hi = umul32hi - ((a < 0) ? b : 0) - ((b < 0) ? a : 0); /* compute the upper 32 bits of the product of two signed 32-bit integers */
/* sum partial products for low */
//uint32_t umul32lo = p0 + (p2 << 16) + (p1 << 16);
uint32_t umul32lo = ((p2 + p1) << 16) + p0;
return (int64_t)mul32hi<<32 | (int64_t)((uint64_t)umul32lo);
// requires native 32x32 -> lower-32 multiply. (16x16 does not work)
}
// =====================================================================================
// The next implementation is based on:
// https://www.techiedelight.com/multiply-16-bit-integers-using-8-bit-multiplier/
// although it required a lot more hacking to fit our use-case...
static uint32_t umultiply16bit(uint16_t m, uint16_t n) {
uint8_t mLow = m; // stores first 8-bits of m
uint8_t mHigh = m>>8; // stores last 8-bits of m
uint8_t nLow = n; // stores first 8-bits of n
uint8_t nHigh = n>>8; // stores last 8-bits of n
uint16_t mLow_nLow = umul8(mLow, nLow); // native 8x8 -> 16 MULs
uint16_t mHigh_nLow = umul8(mHigh, nLow);
uint16_t mLow_nHigh = umul8(mLow, nHigh);
uint16_t mHigh_nHigh = umul8(mHigh, nHigh);
// return mLow_nLow + ((mHigh_nLow + mLow_nHigh) << 8L) + (mHigh_nHigh << 16L);
// -> return (mHigh_nHigh << 16L) + ((mHigh_nLow + mLow_nHigh) << 8L) + mLow_nLow;
return (((mHigh_nHigh << 8L) + (mHigh_nLow + mLow_nHigh)) << 8L) + mLow_nLow;
}
static int32_t multiply16bit(int16_t m, int16_t n) {
int sign = 1;
if (m < 0) {m = -m; sign = -sign;}
if (n < 0) {n = -n; sign = -sign;}
if (sign < 0) {
return -umultiply16bit(m,n);
} else {
return umultiply16bit(m,n);
}
}
// -----------
static uint64_t umultiply32bit(uint32_t m, uint32_t n) {
// could use these 4 as parameters instead of 2 longs:
uint16_t mLow = m; // stores first 8-bits of m
uint16_t mHigh = m >> 16; // stores last 8-bits of m
uint16_t nLow = n; // stores first 8-bits of n
uint16_t nHigh = n >> 16; // stores last 8-bits of n
uint32_t mLow_nLow = umultiply16bit(mLow, nLow);
uint32_t mHigh_nLow = umultiply16bit(mHigh, nLow);
uint32_t mLow_nHigh = umultiply16bit(mLow, nHigh);
uint32_t mHigh_nHigh = umultiply16bit(mHigh, nHigh);
uint32_t LOW, LOW_LOW, SUM, CARRY;
SUM = mHigh_nLow + mLow_nHigh;
LOW_LOW = SUM << 16L;
CARRY = 0; LOW = mLow_nLow+LOW_LOW; if (LOW < mLow_nLow || LOW < LOW_LOW) CARRY = 1;
return ((uint64_t) ((SUM >> 16L)+mHigh_nHigh+CARRY) << 32LL) | LOW;
}
static int64_t multiply32bit(int32_t m, int32_t n) { // for Vectrex, will need to pass in parameters as multiple 16-bit ints (Q16.16 format)
int sign = 1;
// will fail if both params are largest negative integer. I don't care.
if (m < 0) {m = -m; sign = -sign;}
if (n < 0) {n = -n; sign = -sign;}
if (sign < 0) {
return -umultiply32bit(m,n);
} else {
return umultiply32bit(m,n);
}
}
// =====================================================================================
static inline uint8_t msb8(register uint8_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
return(x & ~(x >> 1));
}
static inline uint16_t msb16(register uint16_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
return(x & ~(x >> 1));
}
static inline uint32_t msb32(register uint32_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
return(x & ~(x >> 1));
}
static inline uint64_t msb64(register uint64_t x)
{
x |= (x >> 1);
x |= (x >> 2);
x |= (x >> 4);
x |= (x >> 8);
x |= (x >> 16);
x |= (x >> 32);
return(x & ~(x >> 1));
}
static uint16_t udiv16(uint16_t qq, uint16_t d) {
uint32_t x, q=qq;
uint16_t b, res, t;
t = msb16(d);
//fprintf(stdout, "q: %d t: %d d: %d\n", q, t, d);
if (t != d) t <<= 1; // t >= d
//fprintf(stdout, "t: %d >= d: %d\n", t, d);
b = 1;
while (t < q) {
//fprintf(stdout, "t: %d <= q: %d b: %d\n", t, q, b);
t <<= 1;
b <<= 1;
if (t == 0) break;
}
//fprintf(stdout, "... t: %d q: %d b: %d\n", t, q, b);
res = b;
for (;;) {
x = umultiply16(d, res); // assumes 16 bit quotient was formed as the product of 2 8-bit factors. If not, use umultiply16.
//fprintf(stdout, "trying %d * %d = %d against %d\n", d, res, x, q);
b >>= 1;
if (x == q) {
//fprintf(stdout, "%d / %d = %d (actual %d)\n", q, d, res, q / d);
return res;
} else if ((x <= q) && (x+d > q)) {
//fprintf(stdout, "%d / %d = %d rem %d (actual %d rem %d)\n", q, d, res, q-x, q / d, q % d);
return res;
} else if (x < q) {
res += b;
} else if (x > q) {
res -= b;
}
}
}
static uint8_t udiv16_by_8(uint16_t q, uint8_t d) {
uint16_t x, t;
uint16_t b, res;
t = msb8(d);
//fprintf(stdout, "q: %d t: %d d: %d\n", q, t, d);
if (t != d) t <<= 1; // t >= d
//fprintf(stdout, "t: %d >= d: %d\n", t, d);
b = 1;
while (t < q) {
//fprintf(stdout, "t: %d <= q: %d b: %d\n", t, q, b);
t <<= 1;
b <<= 1;
if (t == 0) break;
}
//fprintf(stderr, "... t: %d q: %d b: %d\n", t, q, b);
res = b;
for (;;) {
x = umul8(d, res); // assumes 16 bit quotient was formed as the product of 2 8-bit factors. If not, use umultiply16.
//fprintf(stdout, "trying %d * %d = %d against %d\n", d, res, x, q);
b >>= 1;
if (x == q) {
//fprintf(stdout, "%d / %d = %d (actual %d)\n", q, d, res, q / d);
return res;
} else if ((x <= q) && (x+d > q)) {
//fprintf(stdout, "%d / %d = %d rem %d (actual %d rem %d)\n", q, d, res, q-x, q / d, q % d);
return res;
} else if (x < q) {
res += b;
} else if (x > q) {
res -= b;
}
}
}
static uint32_t udiv32(uint32_t qq, uint32_t d) {
uint64_t x, q=qq;
uint32_t b, res, t;
t = msb32(d); if (t != d) t <<= 1; // t >= d
b = 1; while (t < q) { t <<= 1; b <<= 1; if (t == 0) break; } res = b;
for (;;) {
x = umultiply32bit(d, res); // umultiply16 assumes 32 bit quotient was formed as the product of 2 16-bit factors. If not, use umultiply32bit.
//fprintf(stdout, "trying %d * %d = %lld against %lld\n", d, res, x, q);
b >>= 1;
if (x == q) {
//fprintf(stdout, "%lld / %d = %d (actual %lld)\n", q, d, res, q / d);
return res;
} else if (x <= q && x+d > q) {
//fprintf(stdout, "%lld / %d = %d rem %lld (actual %lld rem %lld)\n", q, d, res, q-x, q / d, q % d);
return res;
} else if (x < q) {
res += b;
} else if (x > q) {
res -= b;
}
}
}
static uint16_t udiv32_by_16(uint32_t q, uint16_t d) {
uint32_t x, t;
uint16_t b, res;
t = msb16(d); if (t != d) t <<= 1; // t >= d
b = 1; while (t < q) { t <<= 1; b <<= 1; if (t == 0) break; } res = b;
for (;;) {
x = umultiply16(d, res); // assumes 32 bit quotient was formed as the product of 2 16-bit factors. If not, use umultiply32bit.
//fprintf(stderr, "trying %d * %d = %d against %d\n", d, res, x, q);
b >>= 1;
if (x == q) {
//fprintf(stderr, "%d / %d = %d (actual %d)\n", q, d, res, q / d);
return res;
} else if (x <= q && x+d > q) {
//fprintf(stderr, "%d / %d = %d rem %d (actual %d rem %d)\n", q, d, res, q-x, q / d, q % d);
return res;
} else if (x < q) {
res += b;
} else if (x > q) {
res -= b;
}
}
}
static uint32_t udiv64_by_32(uint64_t q, uint32_t d) {
uint64_t x, b, res, t;
t = msb64(d); if (t != d) t <<= 1; // t >= d
b = 1; while (t < q) { t <<= 1; b <<= 1; if (t == 0) break; } res = b; // might check bitsize of b here?
for (;;) {
x = umultiply32bit(d, res); // assumes 64 bit quotient was formed as the product of 2 32-bit factors. If not, this won't work.
//fprintf(stderr, "trying %d * %d = %d against %d\n", d, res, x, q);
b >>= 1;
if (x == q) {
//fprintf(stderr, "%d / %d = %d (actual %d)\n", q, d, res, q / d);
return res;
} else if (x <= q && x+d > q) {
//fprintf(stderr, "%d / %d = %d rem %d (actual %d rem %d)\n", q, d, res, q-x, q / d, q % d);
return res;
} else if (x < q) {
res += b;
} else if (x > q) {
res -= b;
}
}
}
// =====================================================================================
// vectrex random:
uint16_t _x, _a, _b, _c;
uint16_t vrandom() {
_x++; _a = (_a^_c^_x); _b = (_b+_a);
return _c = ((_c+(_b>>1))^_a);
}
void initRandom(uint16_t s1, uint16_t s2, uint16_t s3, uint16_t x0) {
_x = x0; _a = s1; _b = s2; _c = s3; (void)vrandom();
}
// main function
int main(void) {
uint8_t i;
#ifdef NO_HARDWARE_MUL
init_sq(512); // 256 if using the less efficient adder.
#endif
// Need to test GCC's native multiply for various types on Vectrex itself (int:8, long:16, long long:32)
initRandom(294, 4780, 11978, 9371);
// Need to do a lot more loops and time them to get comparitive timings for each method
for (i = 0; i < 10; i++) {
static uint16_t m = 0x7f, n = 0x7f;
uint16_t t;
t = m * n;
printf("Normal 8x8 multiplication m * n = %08x (%d = %d * %d)\n", t, t, m, n);
t = umul8(m, n);
printf("Using 8-bit mul m * n = %08x (%d)\n", t, t);
if ((m >= 0) && (n >= 0)) {
n = udiv16_by_8(t, m);
printf("Division %d / %d = %d\n", t, m, n);
m = 2;
n = udiv16(t, m);
printf("Division %d / %d = %d\n", t, m, n);
}
m = vrandom()&255; n = vrandom()&255;
}
for (i = 0; i < 10; i++) {
static int16_t m = 0x8000, n = 0x8000;
int32_t t, nn;
t = m * n;
printf("Normal 16x16 multiplication m * n = %08x (%d = %d * %d)\n", t, t, m, n);
t = multiply16bit(m, n);
printf("Using 8-bit multiplier A m * n = %08x (%d)\n", t, t);
t = multiply16(m, n);
printf("Using 8-bit multiplier B' m * n = %08x (%d)\n", t, t);
if ((m >= 0) && (n >= 0)) {
n = udiv32_by_16(t, m);
printf("Division %d / %d = %d\n", t, m, n);
m = 2;
nn = udiv32(t, (int32_t)m);
printf("Division %d / %d = %d\n", t, m, nn);
}
m = vrandom(); n = vrandom();
}
for (i = 0; i < 10; i++) {
static int32_t m = 0x80000000, n = 0x80000000;
int64_t t;
t = (int64_t)m * (int64_t)n;
printf("Normal 32x32 multiplication m * n = %016llx (%lld = %d * %d)\n", t, t, m, n);
t = multiply32bit(m, n);
printf("Using cascaded 8-bit multipliers A m * n = %016llx (%lld)\n", t, t);
t = multiply32(m, n);
printf("Using cascaded 8-bit multipliers B' m * n = %016llx (%lld)\n", t, t);
if ((m >= 0) && (n >= 0)) {
n = udiv64_by_32(t, m);
printf("Division %lld / %d = %d\n", t, m, n);
}
m = ((int32_t)vrandom()) * ((int32_t)vrandom()); n = ((int32_t)vrandom()) * ((int32_t)vrandom());
}
return 0;
}