#pragma pack(16)	// 16 byte data align

#define SIMD

#include "vector.h"

vector4::vector4() { ; }	// un-initialised constructor
	
// initialises a vector from 3 or 4 floats
vector4::vector4(float x0, float y0, float z0, float w0=1.0f) {
	x = x0; y = y0; z = z0; w = w0;
}

// set value from 3 or 4 floats
void vector4::vec(float x0, float y0, float z0, float w0=1.0f) {
	x = x0; y = y0; z = z0; w = w0;
}

// clears vector to (0, 0, 0, 1)
void vector4::null(void) { x = y = z = 0.0f; w = 1.0f; }

// indexes the i-th component
float& vector4::operator[](int i)
{ return (&x)[i]; }
	
// evaluates the magnitude of the vector
float vector4::length(void)
{ return (float)sqrt(x*x+y*y+z*z); }

// inverts the vector
void vector4::negate(void)
{ x = -x; y = -y; z = -z; }

// normalises the vector
void vector4::normalize(void) {
	float len = (float)sqrt(x*x+y*y+z*z);
	if (len == 0.0f)
		return;
	x /= len; y /= len; z /= len;
}

// evaluate the cross-product
void vector4::cross(vector4& v1, vector4& v2) {
#ifdef SIMD
	__asm {
		mov esi, v1
		mov edi, v2
		
		movaps xmm0, [esi]
		movaps xmm1. [edi]
		movaps xmm2, xmm0
		movaps xmm3, xmm1
		
		shufps xmm0, xmm0, 0xc9
		shufps xmm1, xmm1, 0xd2
		mulps xmm2, xmm3

		shufps xmm2, xmm2, 0xd2
		shufps xmm3, xmm3, 0xc9
		mulps xmm2, xmm3

		subps xmm0, xmm2
		mov esi, this
		movaps [esi], xmm0
	}

#else
	x = v1.y*v2.z - v1.z*v2.y;
	y = v1.z*v2.x - v1.x*v2.z;
	z = v1.x*v2.y - v1.y*v2.x;
	w = 1.0f;

#endif
}
// define an operator: vector4+=vector4 for vector addition
void vector4::operator+=(vector4& v) {
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		addps xmm0, [edi]
		movaps [esi], xmm0
	}

#else
	x+= v.x; y+= v.y; z+= v.z; w+= v.w;

#endif
}
// define an operator: vector4-=vector4 for vector
// subtraction
void vector4::operator-=(vector4& v) {
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		subps xmm0, [edi]
		movaps [esi], xmm0
	}

#else
	x-= v.x; y-= v.y; z-= v.z; w-= v.w;

#endif
}
// define an operator: vector4*=vector4 for the dot product
void vector4::operator*=(vector4& v) {
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		mulps xmm0, [edi]
		movaps [esi], xmm0
	}

#else
	x*= v.x; y*= v.y; z*= v.z; w*= v.w;

#endif
}
	

// define an operator: vector4+vector4 for vector addition
vector4 vector4::operator+(vector4& v) {
	__declspec(align(16)) vector4 ret;
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		addps xmm0, [edi]
		movaps ret, xmm0
	}

#else
	ret.x = x+v.x; ret.y = y+v.y; ret.z = z+v.z; ret.w = w+v.w;

#endif
	return ret;
}
// define an operator: vector4-vector4 for vector
// subtraction
vector4 vector4::operator-(vector4& v) {
	__declspec(align(16)) vector4 ret;
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		subps xmm0, [edi]
		movaps ret, xmm0
	}

#else
	ret.x = x-v.x; ret.y = y-v.y; ret.z = z-v.z; ret.w = w-v.w;

#endif
	return ret;
}
// define an operator: vector4*vector4 for dot product
vector4 vector4::operator*(vector4& v) {
	__declspec(align(16)) vector4 ret;
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, v
		movaps xmm0, [esi]
		mulps xmm0, [edi]
		movaps ret, xmm0
	}

#else
	ret.x = x*v.x; ret.y = y*v.y; ret.z = z*v.z; ret.w = w*v.w;

#endif
	return ret;
}


// define an operator: vector4*mat4x4 to multiply a vector
// and a matrix
vector4 vector4::operator*(mat4x4& m) {
	__declspec(align(16)) vector4 a;
#ifdef SIMD
	__asm {
		mov esi, this
		mov edi, m

		movaps xmm0, [esi]
		movaps xmm1, xmm0
		movaps xmm2, xmm0
		movaps xmm3, xmm0

		shufps xmm0, xmm2, 0x00
		shufps xmm1, xmm2, 0x55
		shufps xmm2, xmm2, 0xAA
		shufps xmm3, xmm2, 0xFF

		mulps xmm0, [edi]
		mulps xmm1, [edi+16]
		mulps xmm2, [edi+32]
		mulps xmm3, [edi+48]

		addps xmm0, xmm1
		addps xmm0, xmm2
		addps xmm0, xmm3

		movaps a, xmm0
	}

#else
	float *f = (float *)this;
	a.x = x*f[0] + y*f[4] + z*f[8] + w*f[12];
	a.y = x*f[1] + y*f[5] + z*f[9] + w*f[13];
	a.z = x*f[2] + y*f[6] + z*f[10] + w*f[14];
	a.w = x*f[3] + y*f[7] + z*f[11] + w*f[15];

#endif
	return a;
}

//clears all matrix elements to zero
void mat4x4::null(void) {
	memset(&m, 0, sizeof(m));
}
// sets the matrix to an identity matrix
void mat4x4::load_identity(void) {
	memset(m, 0, sizeof(m));
	m[0][0] = m[1][1] = m[2][2] = m[3][3] = 1.0;
}
// sets up a matrix as a rotation or rad around vector vec
void mat4x4::set_rotation(vector4& dir, float rad);

// operator mat4x4*mat4x4
mat4x4 mat4x4::operator*(mat4x4& m1) {
	__declspec(align(16)) mat4x4 m2;
#ifdef SIMD
	__asm {
		mov edi, m1
		movaps xmm4, [edi]
		movaps xmm5, [edi+16]
		movaps xmm6, [edi+32]
		movaps xmm7, [edi+48]

		mov esi, this
		mov eax, 0

		LOOP:
		movaps xmm0, [esi+eax]
		movaps xmm1, xmm0
		movaps xmm2, xmm0
		movaps xmm3, xmm0

		shufps xmm0, xmm2, 0x00
		shufps xmm1, xmm2, 0x55
		shufps xmm2, xmm2, 0xAA
		shufps xmm3, xmm3, 0xFF

		mulps xmm0, [edi]
		mulps xmm1, [edi+16]
		mulps xmm2, [edi+32]
		mulps xmm3, [edi+48]

		addps xmm0, xmm1
		addps xmm0, xmm2
		addps xmm0, xmm3

		movaps m2+eax, xmm0

		add eax, 16
		cmp eax, 48
		jle LOOP
	}

#else
	int i, j, k;
	float ab;
	for (i=0; i<4; i++)
		for (j=0; j<4; j++) {
			ab = 0.0f;
			for(k=0; k<4; k++)
				ab += m[i][k]*m1.m[k][j];
			m2.m[i][j] = ab;
		}

#endif
	return m2;
}
// define an operator: mat4x4*vector4 to multiply a matrix
// and a vector
vector4 mat4x4::operator*(vector4& v) {
	__declspec(align(16)) vector4 a;
#ifdef SIMD
	__asm {
		mov esi, v
		mov edi, this

		movaps xmm0, [esi]
		movaps xmm1, xmm0
		movaps xmm2, xmm0
		movaps xmm3, xmm0

		shufps xmm0, xmm2, 0x00
		shufps xmm1, xmm2, 0x55
		shufps xmm2, xmm2, 0xAA
		shufps xmm3, xmm2, 0xFF

		mulps xmm0, [edi]
		mulps xmm1, [edi+16]
		mulps xmm2, [edi+32]
		mulps xmm3, [edi+48]

		addps xmm0, xmm1
		addps xmm0, xmm2
		addps xmm0, xmm3

		movaps a, xmm0
	}

#else
	float *f = (float *)this;
	a.x = v.x*f[0] + v.y*f[4] + v.z*f[8] + v.w*f[12];
	a.y = v.x*f[1] + v.y*f[5] + v.z*f[9] + v.w*f[13];
	a.z = v.x*f[2] + v.y*f[6] + v.z*f[10] + v.w*f[14];
	a.w = v.x*f[3] + v.y*f[7] + v.z*f[11] + v.w*f[15];

#endif
	return a;
}