;Output driver for RGB colour 520-pixel (ARCADE) projects with 1/26-Mux
;Supported projects:
;  * ArcadeNano Colour
;Supported controllers: Mega644A, -P or -PA (not Mega644!) @ 18.432 MHz
;
;PORTS (ATmega644A/P/PA):
;  PA0-7 = [unused]
;  PB0   = LED Column Shift Register Data
;  PB1   = LED Column Shift Register Clock
;  PB2   = [unused]
;  PB3   = SD-Card Detect Switch (low = card is present)
;  PB4   = SD-Card #CS (voltage divider 1k/2.2k)
;  PB5   = SD-Card D_in (MOSI) (voltage divider 1k/2.2k) [MOSI]
;  PB6   = SD-Card D_out (MISO) (resistor 1k) [MISO]
;  PB7   = SD-Card Clock (voltage divider 1k/2.2k) [SCK]
;  PC0-7 = [unused]
;  PD0   = RS232 Rx [RXD0]
;  PD1   = RS232 Tx [TXD0]
;  PD2   = LED Row Shift Register Output Enable (active low)
;  PD3   = LED Row Shift Register Data [TXD1]
;  PD4   = LED Row Shift Register Clock [XCK1]
;  PD5   = LED Row Shift Register Latch
;  PD6-7 = [unused]
;
;Shift Registers:
;  Columns:
;    * 32 bits total (26 bits used)
;    * Q1 = column 1 = left ... Q26 = column 26 = right (scanning left to right)
;    * Q27~Q32 unused
;  Rows:
;    * 64 bits total (60 bits used)
;    * 4 dummy bits shifted first (appear on Q61~Q64)
;    * followed by bottom (row 20) LED blue data (appears on Q60)
;    * top (row 1) LED red data shifted last (appears on Q1)
;
;Frame Timing:
;  Controller frequency : 18.432 MHz
;  Timer prescaler      : 64
;  PWM "steps" per row  : 111
;  Multiplexing         : 26 columns
;    => 18.432 MHz / 64 / 111 / 26 = ~99.792 Hz (error: ~0.21 %)
;
;Example PWM durations:
;  0, 7, 14, 21, 28, 35, 60, 111 - similar to old optimized exponential
;  0, 7, 14, 21, 36, 57, 82, 111 - Gamma 2.0
;absolute minimum step between two values:  7 (= 448 cycles)
;Selected PWM duration values, one per brightness level 0..7, in timer steps
;(one step = 64 CPU cycles with the Clk/64 prescaler described above).
;These constants are only defined here; they are consumed outside this
;section (presumably by the 'pwm_duration' flash table - TODO confirm).
;Consecutive values must differ by at least 7 steps (see note above).
;NOTE(review): PWMVAL_6 = 48 matches neither example list above (60 / 82) -
;confirm this is the intended brightness curve.
.equ	PWMVAL_0 = 0 ;don't change this value
.equ	PWMVAL_1 = 7
.equ	PWMVAL_2 = 14
.equ	PWMVAL_3 = 21
.equ	PWMVAL_4 = 28
.equ	PWMVAL_5 = 35
.equ	PWMVAL_6 = 48
.equ	PWMVAL_7 = 111 ;don't change this value (timing depends on it)

;===============================================================================

;SD card pins
;(bit numbers refer to PORTB, see port table at the top of this file)
.equ	USE_HARDWARE_SPI = 1
.equ	SD_PORT = PORTB
.equ	SD_CS = 4 ;PB4 = SD-Card #CS
.equ	SDCARD_HAS_DETECT_PIN = 1
.equ	SD_DETECT_PIN = PINB
.equ	SD_DETECT = 3 ;Card Detect switch pin

;row shift register pins - using hardware SPI on USART1
;(data and clock are driven by USART1 on PD3/PD4, so only the manually
; controlled pins are named here)
.equ	ROW_PORT = PORTD
.equ	ROW_PIN = PIND ;written in the ISR to toggle ROW_PORT bits
.equ	ROW_ENABLE = 2 ;PD2, active low output enable
.equ	ROW_LATCH = 5 ;PD5

;column shift register pins
.equ	COL_PORT = PORTB
.equ	COL_PIN = PINB ;written in the ISR to toggle COL_PORT bits
.equ	COL_DATA = 0 ;PB0
.equ	COL_CLOCK = 1 ;PB1

;other settings
.equ	OUT_T0_DIV = 0x03 ;Clk/64
.equ	DISALLOW_UART = 0
.equ	DISALLOW_UART_TX = 0

;===============================================================================

;Abort assembly unless the selected MCU is one of the supported variants
;(MCU and the MCU_* constants are defined outside this section).
.if ((MCU != MCU_MEGA644A) && (MCU != MCU_MEGA644P))
	.error "Output module for ARCADE RGB only works with ATmega644A/P/PA!"
.endif

.macro byte_odd_2_3_3 ;(35 cycles)
	;odd column: prepare byte for shifting = [ (G R) (B G R) (B G R) ]
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects that
	;carry, so 8 bits are shifted into 'temp2' (first bit ends up in bit 7).
	;In:  Z = address of the current row's pixel ([Z] = 0xGR, [Z+1] = 0xXB)
	;Out: Z moved back by 2 rows, temp2 = 8 compare bits
	;Clobbers: temp, flags
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
.endmacro

.macro byte_odd_3_3_2 ;(39 cycles)
	;odd column: prepare byte for shifting = [ (B G R) (B G R) (B G) ]
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects it.
	;In:  Z = address of the row above which processing continues
	;Out: Z moved back by 3 rows, temp2 = 8 compare bits,
	;     temp = [Z] of the last row (green high, red low) - the red nibble
	;     is consumed by a following 'byte_odd_1_3_3_1'
	;Clobbers: temp, flags
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
.endmacro

.macro byte_odd_1_3_3_1 ;(38 cycles)
	;odd column: prepare byte for shifting = [ (R) (B G R) (B G R) (B) ]
	;(byte with red nibble already in 'temp')
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects it.
	;In:  temp = [Z] of the current row (red in low nibble, not yet swapped),
	;     Z = address of the current row's pixel
	;Out: Z moved back by 3 rows, temp2 = 8 compare bits
	;Clobbers: temp, flags
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
.endmacro

.macro byte_even_2_3_3 ;(33 cycles)
	;even column: prepare byte for shifting = [ (G R) (B G R) (B G R) ]
	;(byte with green nibble already in 'temp')
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects it.
	;In:  temp = [Z+1] of the current row (blue high, green low, unswapped),
	;     Z = address of the current row's pixel ([Z] = 0xRX, [Z+1] = 0xBG)
	;Out: Z moved back by 2 rows, temp2 = 8 compare bits
	;Clobbers: temp, flags
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
.endmacro

.macro byte_even_3_3_2 ;(35 cycles)
	;even column: prepare byte for shifting = [ (B G R) (B G R) (B G) ]
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects it.
	;In:  Z = address of the row above which processing continues
	;     ([Z] = 0xRX, [Z+1] = 0xBG)
	;Out: Z moved back by 3 rows, temp2 = 8 compare bits
	;     (the red bit of the last row is left for the next macro)
	;Clobbers: temp, flags
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
.endmacro

.macro byte_even_1_3_3_1 ;(36 cycles: 4 + 13 + 13 + 6)
	;even column: prepare byte for shifting = [ (R) (B G R) (B G R) (B) ]
	;Each 'cp temp, pwm' sets carry when the colour nibble in the high half
	;of 'temp' is below the threshold in 'pwm'; 'rol temp2' collects it.
	;In:  Z = address of the current row's pixel ([Z] = 0xRX, [Z+1] = 0xBG)
	;Out: Z moved back by 3 rows, temp2 = 8 compare bits,
	;     temp = [Z+1] of the last row (blue high, green low, unswapped) -
	;     the green nibble is consumed by a following 'byte_even_2_3_3'
	;Clobbers: temp, flags
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
.endmacro

.macro byte_shift
	;send byte to row shift register
	;The bits collected in 'temp2' are 1 where the pixel value is below the
	;PWM threshold (carry from 'cp'); for non-inverted rows the byte is
	;complemented so that 1 means "value >= threshold".
	;Writes 'temp2' to the USART1 data register (SPI master mode, see
	;'init_output'). 'temp2' is modified when OUT_INVERT_ROWS == 0.
  .if (OUT_INVERT_ROWS == 0)
	com	temp2
  .endif
	_out	UDR1, temp2
.endmacro

;--------------------

oc0:	;Timer 0 output compare interrupt (PWM steps)
	;Runs once per column and PWM step:
	;  1. prepares the column shift register data bit and advances 'mux'
	;     (and 'pwm' after the last column)
	;  2. loads the next compare value from the 'pwm_duration' flash table
	;  3. compares all pixels of the current column against 'pwm' and
	;     shifts the resulting 8 bytes to the row shift register (USART1)
	;  4. blanks the LEDs, clocks the column register, latches the row
	;     data and re-enables the LEDs
	;Saves/restores: temp, temp2, Z, SREG (via 'sreg_backup')
	;Updates: mux, pwm, RAM_MuxAddress, OCR0
	push	temp
	push	temp2
	_push_w	Z
	in	sreg_backup, SREG
	
	;increment column
	;(column data bit is inactive by default; it is only made active for
	; the one step where the column sequence restarts)
.if (OUT_INVERT_COLUMNS)
	sbi	COL_PORT, COL_DATA
.else
	cbi	COL_PORT, COL_DATA
.endif
	dec	mux
	brne	oc0_colinc_end
	;mux reached zero: restart column sequence and feed one active bit
	ldi	mux, WIDTH
.if (OUT_INVERT_COLUMNS)
	cbi	COL_PORT, COL_DATA
.else
	sbi	COL_PORT, COL_DATA
.endif
	;increment PWM value ('pwm' register stores pwm value in high nibble!)
	subi	pwm, -0x30 ;PWM value sequence: 3, 6, 1, 4, 7, 2, 5, (0)
	andi	pwm, 0x70
	brne	oc0_pwminc_end
	;zero: end of PWM sequence
	ldi	pwm, 0x30 ;start with first value of PWM sequence
	tick_100hz ;max. 26 cycles
oc0_pwminc_end:
	;load start address: first pixel in bottom row (bottom-to-top mux)
	activeframe Z ;max. 6 cycles
	_addi_w	Z, (WIDTH * CHANNELS / 2) * (HEIGHT - 1)
	_sts_w	RAM_MuxAddress, Z
oc0_colinc_end:
	
	;determine next interval
	;(table index = PWM value moved from high to low nibble)
	mov	temp, pwm
	swap	temp
	_ldi_w	Z, (FLASH_OFFSET + pwm_duration) * 2
	add	ZL, temp
	adc	ZH, zero
	;TODO: use this (save 1 cycle):
	;mov	ZL, pwm
	;swap	ZL
	;clr	ZH
	;_addi_w	Z, (FLASH_OFFSET + pwm_duration) * 2
	lpm	temp, Z
	out	OCR0, temp
	
	;load framebuffer address for current row
	_lds_w	Z, RAM_MuxAddress
	
	;max. 72 cycles to this point (not including jump to 'oc0')
	
	;bit 0 of 'mux' set => even column: skip the jump and fall through
	sbrs	mux, 0 ;column = 1 (odd)  =>  mux = WIDTH = 26 (even)
	rjmp	oc0_col_odd
	
	;*********************************************
	; even column (RAM: [Z] = 0xRX, [Z+1] = 0xBG)
	;*********************************************
	
	;first byte: upper 4 bits are dummy bits (previous contents of
	;'temp2'), lower 4 bits are row 20 blue/green/red and row 19 blue
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	swap	temp
	cp	temp, pwm ;green
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;row 19
	ldd	temp, Z+1
	cp	temp, pwm ;blue
	rol	temp2
	                  ;byte 1 = [  x   x   x   x  . 20b 20g 20r 19b ]
        byte_shift
	byte_even_2_3_3   ;byte 2 = [ 19g 19r 18b 18g . 18r 17b 17g 17r ]
	byte_shift
	byte_even_3_3_2   ;byte 3 = [ 16b 16g 16r 15b . 15g 15r 14b 14g ]
	byte_shift
	byte_even_1_3_3_1 ;byte 4 = [ 14r 13b 13g 13r . 12b 12g 12r 11b ]
	byte_shift
	byte_even_2_3_3   ;byte 5 = [ 11g 11r 10b 10g . 10r  9b  9g  9r ]
	byte_shift
	byte_even_3_3_2   ;byte 6 = [  8b  8g  8r  7b .  7g  7r  6b  6g ]
	byte_shift
	byte_even_1_3_3_1 ;byte 7 = [  6r  5b  5g  5r .  4b  4g  4r  3b ]
	byte_shift
	byte_even_2_3_3   ;byte 8 = [  3g  3r  2b  2g .  2r  1b  1g  1r ]
	byte_shift
	
	;282 cycles (from first ldd to here:
	;  17 + 3*33 + 2*35 + 2*36 + 8*3, assuming 3 cycles per 'byte_shift'
	;  as in the odd branch);
	;288 cycles for the whole even branch including the 'sbrs' skip, the
	;address update and the 'rjmp' below - still less than the odd branch
	
	;calculate framebuffer address for next column:
	;  back to bottom row, add 2 for next column (no shared byte)
	_addi_w	Z, (WIDTH * CHANNELS / 2) * (HEIGHT - 1) + 2
	
	rjmp	oc0_col_end
	
oc0_col_odd:
	;********************************************
	; odd column (RAM: [Z] = 0xGR, [Z+1] = 0xXB)
	;********************************************
	
	;first byte: upper 4 bits are dummy bits (previous contents of
	;'temp2'), lower 4 bits are row 20 blue/green/red and row 19 blue
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	ld	temp, Z
	cp	temp, pwm ;green
	rol	temp2
	swap	temp
	cp	temp, pwm ;red
	rol	temp2
	sbiw	ZH:ZL, WIDTH * CHANNELS / 2 ;previous row
	ldd	temp, Z+1
	swap	temp
	cp	temp, pwm ;blue
	rol	temp2
	                 ;byte 1 = [  x   x   x   x  . 20b 20g 20r 19b ]
        byte_shift
	byte_odd_2_3_3   ;byte 2 = [ 19g 19r 18b 18g . 18r 17b 17g 17r ]
	byte_shift
	byte_odd_3_3_2   ;byte 3 = [ 16b 16g 16r 15b . 15g 15r 14b 14g ]
	byte_shift
	byte_odd_1_3_3_1 ;byte 4 = [ 14r 13b 13g 13r . 12b 12g 12r 11b ]
	byte_shift
	byte_odd_2_3_3   ;byte 5 = [ 11g 11r 10b 10g . 10r  9b  9g  9r ]
	byte_shift
	byte_odd_3_3_2   ;byte 6 = [  8b  8g  8r  7b .  7g  7r  6b  6g ]
	byte_shift
	byte_odd_1_3_3_1 ;byte 7 = [  6r  5b  5g  5r .  4b  4g  4r  3b ]
	byte_shift
	byte_odd_2_3_3   ;byte 8 = [  3g  3r  2b  2g .  2r  1b  1g  1r ]
	byte_shift

	;302 cycles (from first ldd to here)

	;calculate framebuffer address for next column:
	;  back to bottom row, add 1 for next column (odd/even shared byte)
	_addi_w	Z, (WIDTH * CHANNELS / 2) * (HEIGHT - 1) + 1

oc0_col_end:
	
	;max. 72 + 307 = 379 cycles to this point (not including jump to 'oc0')
	
	;store framebuffer address for next column
	_sts_w	RAM_MuxAddress, Z
	
	;restore registers (delay to wait for last SPI transfer)
	;(none of the instructions following 'out SREG' modify the flags)
	out	SREG, sreg_backup
	_pop_w	Z
	pop	temp2

	;update LEDs
	;(writing a 1 to a PINx bit toggles the corresponding PORTx bit, so
	; each pair of 'out' instructions produces one pulse)
	sbi	ROW_PORT, ROW_ENABLE ;LEDs off
	ldi	temp, 1<<COL_CLOCK ;advance to next column
	out	COL_PIN, temp ;toggle output
	out	COL_PIN, temp ;toggle output
	ldi	temp, 1<<ROW_LATCH ;latch new row data
	out	ROW_PIN, temp ;toggle output   (1st toggle >= 19 cycles after
	out	ROW_PIN, temp ;toggle output    last 'out' incl. 'out')
	pop	temp
	cbi	ROW_PORT, ROW_ENABLE ;LEDs on (with new data)
	
	;return
	reti
	
	;max. 72 + 307 + 27 = 406 cycles (not including jump to 'oc0')

;===============================================================================

.macro init_output
	;One-time hardware setup for this output driver:
	;  * presets 'mux' and 'pwm' so that the first timer interrupt wraps
	;    both and starts with column 1 and the first PWM value
	;  * configures port directions and pull-ups
	;  * clears the complete column shift register (all columns off)
	;  * configures USART1 as SPI master for the row shift register
	;Clobbers: temp, flags
	
	;initialize registers
	ldi	mux, 1 ;last column in sequence
	ldi	pwm, 0x50 ;last PWM value in sequence

	;initialize ports
	;unused PORTA and PORTC: inputs with pull-ups
	ldi	temp, 0x00
	out	DDRA, temp
	out	DDRC, temp
	ldi	temp, 0xFF
	out	PORTA, temp
	out	PORTC, temp
	;PORTB: SD-Card, column shift register & 1 unused pin
	ldi	temp, 0xB3 ;outputs: SCK, MOSI, CS, COL_DATA, COL_CLOCK
	out	DDRB, temp
	ldi	temp, 0x4C ;pull-ups for MISO, SD_DETECT and unused pin
	out	PORTB, temp
	;PORTD: UART, row shift register & 2 unused: pull-ups for RxD & unused
	ldi	temp, 0x3E ;outputs: TxD, ROW_ENABLE, row data, row clock, ROW_LATCH
	out	DDRD, temp
	ldi	temp, 0xC3 | 1<<ROW_ENABLE ;ROW_ENABLE is active low
	out	PORTD, temp
	
	;initialize data in column shift register (all columns off)
	;COL_DATA is low after the port setup above, which is the inactive
	;level for non-inverted columns; inverted columns need it high.
  .if (OUT_INVERT_COLUMNS)
	sbi	COL_PORT, COL_DATA
  .endif
	;clock the inactive level through all 32 bits of the column shift
	;register (26 used + 6 unused), so that no stage keeps its undefined
	;power-up state and no stray column is driven during the first scan
	ldi	temp, 32
init_shift_loop:
	sbi	COL_PORT, COL_CLOCK
	cbi	COL_PORT, COL_CLOCK
	dec	temp
	brne	init_shift_loop
	
	;initialize USART1 for SPI master mode (row shift register)
	;(Tx only, MSB first, setup on falling edge, sample on rising edge)
	ldi	temp, 1<<UMSEL11 | 1<<UMSEL10
	_out	UCSR1C, temp
	ldi	temp, 1<<TXEN
	_out	UCSR1B, temp
	;baud rate is written after the transmitter has been enabled
	ldi	temp, 0 ;SCK frequency = Clk / 2
	_out	UBRR1H, temp
	_out	UBRR1L, temp
	
.endmacro
