/*W B Langdon at MUN 29 April 2007 $Revision: 1.24x $*/
/* WBL 11 Oct 2007 Cut down to just example code */

// Population geometry.
const int NP  = 512 * 512;  // number of programs in the population
const int LEN = 15 + 1;     // maximum GP individual length: 15 opcodes plus room for a stop code

// Host-side byte-code buffer: LEN bytes per program, NP programs stored back to back.
// The contents are not initialised here.
unsigned char* Pop = new unsigned char[LEN * NP];

// Value the evolved programs are scored against (see eval_Pop).
const float pi = 4 * atan2(1.0, 1.0);

// Byte codes stored in Pop/PROG, one per program instruction.
// The numeric values are spelled out because they are the bytes written into the population.
enum {
  OPNOP    = 0,  // no operation
  OPADD    = 1,  // arithmetic operators
  OPSUB    = 2,
  OPMUL    = 3,
  OPDIV    = 4,
  OPinput1 = 5,  // input terminals
  OPinput2 = 6,
  OPconst1 = 7,  // constant terminals
  OPconst2 = 8
};

  // RapidMind array holding the byte code of the whole population
  // (LEN*NP elements). eval_Pop copies Pop into it before each run and the
  // interpreter program reads it through the OPCODE macro (::PROG[PC+prog0]).
  Array<1,Value1ub> PROG(LEN*NP); //unsigned 8-bit byte
// Owns the RapidMind program that interprets the GP population.
// The constructor initialises RapidMind and records the interpreter into
// m_update; the single global instance `gpu` is defined at the end of the
// class, so this happens during static initialisation.
class GPU {
public:
  //Avoid repeated gpu compilations by keeping m_update live
  rapidmind::Program m_update;
  // Records the stack-based interpreter program into m_update.
  GPU() {
  // General initialization of the platform
  rapidmind::init();

// Binary operators on the two top stack elements: stack(0) is the top
// (right-hand operand), stack(1) the element below it (left-hand operand).
// DIV is a plain division; no divide-by-zero protection is applied here.
#define ADD stack(1)+stack(0)
#define SUB stack(1)-stack(0)
#define MUL stack(1)*stack(0)
#define DIV stack(1)/stack(0)
// Byte code at the current program counter: prog0 is the offset of this
// program's first byte in ::PROG, PC the position within the program.
#define OPCODE ::PROG[PC+prog0]
// Conditional PUSH: if the current opcode equals XCODE, V becomes the new top
// and the old elements 0..2 move down one slot (the old stack(3) is dropped),
// then PC advances by one. Otherwise stack and PC are left unchanged.
#define OP1(XCODE,V) \
	stack = cond(XCODE==OPCODE,join(V,stack(0,1,2)),stack); \
	  PC  = cond(XCODE==OPCODE,PC+Value1i(1),PC);
//conditionally POP stack (fake by using rotation)
// If the current opcode equals XCODE, the two operands are replaced by the
// result OP: the new stack is (OP, stack(2), stack(3), stack(1)), i.e. the
// stale stack(1) is rotated into the bottom slot. PC then advances by one.
#define OP3(XCODE,OP) \
	stack = cond(XCODE==OPCODE,join(OP,stack(2,3,1)),stack); \
	  PC  = cond(XCODE==OPCODE,PC+Value1i(1),PC);
  m_update = RM_BEGIN {
		// Input: start offset of this program in ::PROG. Output: top of stack.
		In<Value1i> prog0;
		Out<Value1f> top;
		Value1i PC = 0;
		// Four-element evaluation stack; it is not given an initial value here.
		Value4f stack;

		//ensure everyone gets 15 instructions.
		// The body runs LEN-1 times for every program. Each OPn re-reads OPCODE
		// with the PC left by the previous OPn, so one pass through the body can
		// consume more than one instruction. Opcodes without a handler below
		// (OPNOP, OPinput1, OPinput2) never advance PC, so interpretation stays
		// at them for the remaining passes.
		Value1i i = 0;
		FOR(i,i<(LEN-1),i++) {
		OP1(Value1ub(OPconst1),Value1f(0.8694791));
		OP1(Value1ub(OPconst2),Value1f(1.44024));
		OP3(Value1ub(OPADD),ADD);
		OP3(Value1ub(OPSUB),SUB);
		OP3(Value1ub(OPMUL),MUL);
		OP3(Value1ub(OPDIV),DIV);
		}
		ENDFOR

		// The program's result is whatever is on top of the stack at the end.
		top=stack[0];
  } RM_END;
  }
} gpu ;//endclass GPU

// One entry per program (NP in total). main() fills entry i with i*LEN, the
// offset of program i's first byte in PROG; eval_Pop passes the array as the
// input of gpu.m_update.
Array<1,Value1i> prog0(NP);     //used to simulate indexOf

// Evaluates the whole population and returns the index of the best program.
//
// Copies the host byte code in Pop into PROG, runs gpu.m_update over prog0 and
// scores every program by the absolute difference between its result and pi.
//
// Returns: the index in [0, NP) of the program with the smallest error. Ties
//   go to the lowest index; 0 is returned when no error compares below
//   FLT_MAX (for example when every result is NaN).
// Side effect: `beste` is set to the error of the returned program, or left at
//   FLT_MAX if none qualified. `beste` is not declared in this function; it
//   is expected to be a float visible at file scope.
int eval_Pop() {
  //Transfer GP population onto GPU
  unsigned char* input_PROG = PROG.write_data();
  memcpy(input_PROG,Pop,LEN*NP);

  Array<1,Value1f> top = gpu.m_update(prog0); //Run whole GP pop on GPU
  const float* result = top.read_data();      //get results from GPU

  beste = FLT_MAX;
  int bestp = 0;
  for(int i=0;i<NP;i++) {
    // Use fabs, not abs: depending on which headers are in scope, an
    // unqualified abs() can resolve to the C int abs(int), which truncates
    // every error below 1 to 0 and makes the first such program look perfect.
    // fabs comes from the same math header as the atan2 used for pi.
    const float e = fabs(pi-result[i]);
    if(e<beste) { beste=e; bestp=i;}
  }
  return bestp;
}

// Example driver: fills prog0 with the start offset of every program. Creating
// the population and calling eval_Pop are left as TODOs in this cut-down
// example (see the end of the body).
int main(int argc, char** argv) {
  // Optionally select specific backends. We'll let the platform decide on the
  // best one to use by not including any useBackend lines.
  // use_backend("glsl");
  // use_backend("cell");
  // use_backend("cc");

  // Access the internal arrays where the data is stored
  int* input_prog0 = prog0.write_data();

  // Fill the input arrays
  for(int i = 0; i<NP;i++) {
    input_prog0[i] = i*LEN; //point to start of this pixels PROG
  }

  // The following placeholder was bare text in the original example and did
  // not compile; it is kept as comments.
  // TODO: create GP population on cpu.
  // TODO: use eval_Pop to do all fitness evaluation on GPU. See
  // http://www.cs.ucl.ac.uk/staff/W.Langdon/ftp/gp-code/

}