#define VECTREX 1
// Although I have another file with generic divide code, this is specific to dividing by 10
// which is frequently needed when displaying numbers on screen, and worth optimizing.
// That said, I have not yet put this code on the clock to confirm timings.

// This file contains a 16-bit signed divide by 10,
// and 8-bit signed and unsigned divides by 10 (with two different
// implementations of each - time them and use the faster ones)

#ifdef VECTREX
#include <vectrex.h>
#include "muldiv.h"
// high is 0 on intel, 1 on 6809
#define HIGH 0
#define LOW  1
#define int8_t int
#define uint8_t unsigned int
#define int16_t long
#define uint16_t unsigned long
#define USE_MUL 1
#define EXIT_SUCCESS 0
#define EXIT_FAILURE 1
#else
#include <stdio.h>
#include <stdlib.h>
// high is 0 on intel, 1 on 6809
#define HIGH 0
#define LOW  1
#define int8_t signed char
#define uint8_t unsigned char
#define int16_t signed short
#define uint16_t unsigned short
#define USE_MUL 1
#define TEST
#endif

// If we read and write our parameter and result from a volatile pointer,
// we can ensure the compiler does not optimise expressions out of existence
// when the input is known at compile time!
static long long P1forced_access;
static volatile long long *P1LongLong = (volatile long long *)&P1forced_access;
static volatile      long *P1Long     = (volatile long *)&P1forced_access;
static volatile       int *P1Int      = (volatile int *)&P1forced_access;
static long long P2forced_access;
static volatile long long *P2LongLong = (volatile long long *)&P2forced_access;
static volatile      long *P2Long     = (volatile long *)&P2forced_access;
static volatile       int *P2Int      = (volatile int *)&P2forced_access;

static void SHOW_NUM(char *dest, long num) { // left-aligned
#define show_digit(x) *dest++ = (char)((x)+'0')
  int8_t digit, zeroes;

  // This replaces code that used divide by 10 and modulo 10.  Much faster.

  // handles full 16 bit range of -32768:32767  -  Uses negative numbers to avoid the issue of negating -32768

  if (num >= 0) num = -num; else *dest++ = '-';
  digit = 0;
  zeroes = 1; // CLRing is shorter
  // max 11 add/subtracts...
  if (num <= -20000) { num += 20000; digit += 2; zeroes = 0; }
  if (num <= -10000) { num += 10000; digit += 1; zeroes = 0; }
  if (!zeroes) show_digit(digit);
  digit = 0;
  if (num <= -8000) { num += 8000; digit += 8; zeroes = 0; } else if (num <= -4000) { num += 4000; digit += 4; zeroes = 0; }
  if (num <= -2000) { num += 2000; digit += 2; zeroes = 0; }
  if (num <= -1000) { num += 1000; digit += 1; zeroes = 0; }
  if (!zeroes) show_digit(digit);
  digit = 0;
  if (num <= -800) { num += 800; digit += 8; zeroes = 0; } else if (num <= -400) { num += 400; digit += 4; zeroes = 0; }
  if (num <= -200) { num += 200; digit += 2; zeroes = 0; }
  if (num <= -100) { num += 100; digit += 1; zeroes = 0; }
  if (!zeroes) show_digit(digit);
  digit = 0;
  if (num <= -80) { num += 80; digit += 8; zeroes = 0; } else if (num <= -40) { num += 40; digit += 4; zeroes = 0; }
  if (num <= -20) { num += 20; digit += 2; zeroes = 0; }
  if (num <= -10) { num += 10; digit += 1; zeroes = 0; }
  if (!zeroes) show_digit(digit);
  show_digit((int8_t)-num);
  *dest++ = ' ';
  *dest = 0x80;
}


// An accurate signed 16 bit division by 10
// Taken from http://homepage.divms.uiowa.edu/~jones/bcd/divide.html

static inline int16_t sdiv10_16(int16_t A) {
  int16_t Q; /* the quotient */
  int16_t R; /* the remainder */

  // Avoid using divide:
  Q = ((A >> 1) + A) >> 1; /* Q = A*0.11 */
  Q = ((Q >> 4) + Q)     ; /* Q = A*0.110011 */
  Q = ((Q >> 8) + Q) >> 3; /* Q = A*0.00011001100110011 */
  /* either Q = A/10 or Q+1 = A/10 for -87,381 < A < 534,890 */

#ifdef USE_MUL  // Need to time both cases on 6809 to see which is faster.
  R = Q * 10;
#else
  R = ((Q << 2) + Q) << 1;
#endif
  R = A - R; /* R = A - 10*Q */
  if (R >= 10) { R -= 10; Q += 1; }   /* Q = A/10 for -87,381 < A < 534,890 */
  if ((Q < 0) && (R != 0)) Q += 1;
#ifdef TEST  
  if (Q != A/10) {
    fprintf(stderr, "A: %d A/10: %d Q: %d\n", A, A/10, Q);
    exit(EXIT_FAILURE);
  }
#endif
  return Q;
}

// ditto with remainder exported to user
static inline int16_t sdiv10_16_with_remainder(int16_t A, int16_t *Remainder) {
  int16_t Q; /* the quotient */
  int16_t R; /* the remainder */

  // Avoid using divide:
  Q = ((A >> 1) + A) >> 1; /* Q = A*0.11 */
  Q = ((Q >> 4) + Q)     ; /* Q = A*0.110011 */
  Q = ((Q >> 8) + Q) >> 3; /* Q = A*0.00011001100110011 */
  /* either Q = A/10 or Q+1 = A/10 for -87,381 < A < 534,890 */

#ifdef USE_MUL  // Need to time both cases on 6809 to see which is faster.
  R = Q * 10;
#else
  R = ((Q << 2) + Q) << 1;
#endif
  R = -R + A; /* R = A - 10*Q */
  if (R >= 10) { R -= 10; Q += 1; }   /* Q = A/10 for -87,381 < A < 534,890 */
  if ((Q < 0) && (R != 0)) { R -= 10; Q += 1; }
  *Remainder = R;
#ifdef TEST  
  if ((Q != A/10) || (R != A%10)) {
    fprintf(stderr, "A: %d A/10: %d Q: %d  R: %d  A%%10: %d\n", A, A/10, Q, R, A%10);
    exit(EXIT_FAILURE);
  }
#endif
  return Q;
}

// divide signed 8-bit quantity by 10 using 16 bit intermediate at most
static inline int8_t s8divided_by_10(int8_t x) {
  int8_t rem;
  int8_t result;
  result = (int8_t)(((int16_t)x * 25L) >> 8L); // A*B should be the only 16 bit quantity
#ifdef USE_MUL
  //rem = -(result*10)+x; // *10 is marginally faster than shift+add.
  rem = (-10*result)+x; // yet to check if r*-10 faster than -(r*10). Probably is.
#else
  rem = -(((result << 2) + result) << 1) + x;
#endif
  if (rem > 0) {
    if ((x < 0) || (rem >= 10)) result++;
  }
#ifdef TEST
  if (result != x/10) {
    printf("s: %d %d %d rem %d\n", x, x/10, result, rem);
    exit(EXIT_FAILURE);
  }
#endif
  return result;
}

// divide unsigned 8-bit quantity by 10 using 16 bit intermediate at most
static inline uint8_t u8divided_by_10(uint8_t x) {
  uint8_t rem;
  uint8_t result;
  result = (uint8_t)(((uint16_t)x * 25UL) >> 8UL); // Again, A*B should be the only 16 bit quantity
  rem = x - result*10; // yet to check if *10 faster than shift+add. Probably is.
  if (rem >= 10) result++;
#ifdef TEST
  if (result != x/10U) {
    printf("u: %u %u %u rem %u\n", x, x/10U,result,rem);
    exit(EXIT_FAILURE);
  }
#endif
  return result;
}

static inline uint8_t divu10_8(uint8_t i) {
  uint8_t result, remainder;
  union twobytes {         // an experiment to see if the 6809 code generator can be forced to avoid using 8 double-byte LSR instructions
    uint16_t sixteenbit;   // Despite being lengthier source code this should be tighter machine code.
    uint8_t eightbit[2];
  } temp;
  temp.sixteenbit = (i+1)*51;
  result = temp.eightbit[LOW]>>1;
  remainder = i - result*10;
#ifdef TEST
  if (result != i/10) {
    printf("U: %u/10 = %u? %u rem %u\n", i, i/10, result, i - result*10);
    exit(EXIT_FAILURE);
  }
#endif
  return result;
}

static inline int8_t divs10_8(int8_t i) {
  int8_t result;
  union twobytes {         // an experiment to see if the 6809 code generator can be forced to avoid using 8 double-byte ASR instructions
    int16_t sixteenbit;
    int8_t eightbit[2];
  } temp;
  temp.sixteenbit = (i+1)*51;
  result = temp.eightbit[LOW]>>1;
  if ((i < 0) && (i > result*10)) result += 1; // there's probably a bit-twiddling expression to avoid two comparisons...
                                               // (but just casting i to uint8_t won't work)
#ifdef TEST
  if (result != i/10) {
    printf("S: %d/10 = %d? %d rem %d\n", i, i/10, result, i - result*10);
    exit(EXIT_FAILURE);
  }
#endif
  return result;
}

#define IRQ_6809_MASK 0x10
#define ENABLE_IRQ_6809 asm("andcc #0xef")
#define DISABLE_IRQ_6809 asm("orcc #0x10")

//static int Ready = 0;
// does not return, since we use the
// actual waitRecal "shortcut" to jump to "set_refresh" and return from there
__attribute__((noinline)) void iWaitRecal(void) // if inlined, the JMP -> RTS goes astray!!!
{
	asm("jmp 0xF1A2"); // Set_Refresh
}

volatile unsigned int8_t *rand = (volatile unsigned int *)0xc87b;
volatile unsigned int8_t *timer_lo = (volatile unsigned int *)0xD004;

static unsigned int16_t udiv16(unsigned int16_t qq, unsigned int16_t d) {
  unsigned int32_t x, q=qq;
  unsigned int16_t b, res, t;

  t = msb16(d);
  //fprintf(stdout, "q: %d  t: %d  d: %d\n", q, t, d);
  if (t != d) t <<= 1; // t >= d
  //fprintf(stdout, "t: %d >= d: %d\n", t, d);
  b = 1;
  while (t < q) {
    //fprintf(stdout, "t: %d <= q: %d  b: %d\n", t, q, b);
    t <<= 1;
    b <<= 1;
    if (t == 0) break;
  }
  //fprintf(stdout, "... t: %d  q: %d  b: %d\n", t, q, b);
  res = b;
  for (;;) {
    x = d*res; // assumes 16 bit quotient was formed as the product of 2 8-bit factors.  If not, use umultiply16.
    //fprintf(stdout, "trying %d * %d = %d against %d\n", d, res, x, q);
    b >>= 1;
    if (x == q) {
      //fprintf(stdout, "%d / %d = %d (actual %d)\n", q, d, res, q / d);
      return res;
    } else if ((x <= q) && (x+d > q)) {
      //fprintf(stdout, "%d / %d = %d rem %d  (actual %d rem %d)\n", q, d, res, q-x, q / d, q % d);
      return res;
    } else if (x < q) {
      res += b;
    } else if (x > q) {
      res -= b;
    }
  }
}

int main(void) {
  static unsigned long t1,t2;
  static char debug[12];
  Vec_Rfrsh = 0L;
  Vec_Loop_Count = 0L;


  for (;;) {
    Wait_Recal();
    Reset0Ref();
    Intensity_a(0x3f);
    {
      asm(" nop ");
// initialisation:
      *P1Long = 12345L;
      *P2Long = 10L;
      long p1 = *(long *)P1Long; // use of volatile forces code to be executed
      long p2 = *(long *)P2Long;
      // insert code to be tested here:
      t1 = dp_VIA_t2;
//=======================================================================================
      //asm(" nop "); // 4 cycles (not 3 like https://atjs.mbnet.fi/mc6809/Information/6809.htm )
      //asm(" adda #0 "); // 4 cycles
      //p1 = sdiv16_by_16(p1,p2); // 17626 cycles
      //p1 = sdiv16_by_8(p1,(int)p2); // 17634 cycles
      //p1 = (long)udiv16_by_16((unsigned long)p1,(unsigned long)p2); // 17180 cycles
      //p1 = (long)udiv16_by_8((unsigned long)p1,10); // 17138 cycles
      //p1 = p1/10L;  // 2364 cycles
      //p1 = p1/p2;  // 2360 cycles
      //p1 = sdiv10_16(p1); // 352 cycles
      p1 = (long)udiv16((unsigned long)p1,(unsigned long)p2); // 9760 cycles
//=======================================================================================
      t2 = dp_VIA_t2;
      t1 = (t1<<8) | ((t1>>8)&255);
      t2 = (t2<<8) | ((t2>>8)&255);
      *P1Long = p1; // force results to be calculated and not discarded
      *P2Long = p2;
    }

    SHOW_NUM(debug, (long)((t1-t2-13L)<<1)); // cycle count of operation above
    Print_Str_d(80, -40, debug);

    //SHOW_NUM(debug, (long)t1); // display the result as a check!
    SHOW_NUM(debug, (long int)*P1Long); // display the result as a check!
    Print_Str_d(40, -40, debug);

    continue;


    (void)(*P1LongLong);
    (void)(*P1Long);
    (void)(*P1Int);
    (void)(*P2LongLong);
    (void)(*P2Long);
    (void)(*P2Int);
  }
#ifdef NEVER
  { // signed 16-bit divide
    int16_t Q,R,A = 0;
    do Q = sdiv10_16_with_remainder(A++,&R); while (A);
    do Q = sdiv10_16(A++); while (A);
  }
  { // unsigned 8-bit divide
    uint8_t i, r2, r5;
    i = 0;
    for (;;) {
      if (i) {
        r2 = u8divided_by_10(i);
        r5 = divu10_8(i);
      }
      i++;
      if (i == 0) break;
    }
  }
  { // signed 8-bit divide
    int8_t i, r1, r2, r5;
    i = -128;
    for (;;) {
      if (i) {
        r1 = i/10;
        r2 = s8divided_by_10(i);
        r5 = divs10_8(i);
      }
      i++;
      if (i == -128) break;
    }
  }
#ifdef TEST
  printf("All test passed\n");
  exit(EXIT_SUCCESS);
#endif
#endif
  return (EXIT_FAILURE);
}