Uploading attention core v0.1

059e050a · Francisco Barranco · e86681b9 · 059e050a · 059e050a · 059e050a
Commit 059e050a authored Feb 11, 2014 by Francisco Barranco
18 changed files
--- a/attention/attention_v0.1/GaborPrimitives.hcc
+++ b/attention/attention_v0.1/GaborPrimitives.hcc
+//********************************************************************
+// 
+//  Programmed by Javier Díaz, DRIVSCO project
+//  Granada, October 2009, version 1.0
+// 
+// Note: Francisco Barranco added some changes to this file. The functions
+// added have been documented. The rest has been used as they are.
+//********************************************************************
+#include "GaborPrimitives.hch"
+
+/*
+// Gabor filter kernel coefficients for the 14-bit version (normalization to 2^14)
+// **************************************************************************
+macro expr NORMY=  16384; //exp2(KERN_BITS); 
+macro expr NORMX=  NORMY/2;   // one bit more precision           
+         
+// FILTERS AND NORM COMPUTED SO THAT THE KERNEL COEFFICIENTS SUM = 1 --> this allows division by a power of 2
+
+macro expr Gab1={         243     ,     601   ,     1214   ,     1989    ,    2671   ,     2948  };  // sym
+macro expr Gab2={         -12     ,    590    ,     -23    ,   -2012     ,    -23    ,    2960   };  // sym
+macro expr Gab3={        -254     ,      0    ,    1214    ,       0     ,  -2682    ,       0   };  // antisym
+macro expr Gab4={         127     ,   -220    ,   -1237    ,   -1260     ,   1133    ,    2914   };  // sym
+macro expr Gab5={         162     ,    590    ,     231    ,   -1584     ,  -2393    ,       0   };  // antisym
+macro expr Gab6={         127     ,    520    ,    -439    ,   -1966     ,    289    ,    2938   };  // sym
+macro expr Gab7={        -208     ,    289    ,    1145    ,    -474     ,  -2659    ,       0   };  // antisym
+macro expr Gab8={        -254     ,   -451    ,    -289    ,     717     ,   2197    ,    2948   };  // sym 
+macro expr Gab9={         -23     ,   -405    ,   -1179    ,   -1862     ,  -1526    ,       0   };  // antisym  */
+
+// Gabor filter kernel coefficients, 12-bit version (normalization to 2^12)
+// **************************************************************************
+// Each GabN holds 6 coefficients of an 11-tap kernel. GenKernel_Gabor applies
+// element i (i = 0..4) to buffer[i] and buffer[10-i], and element 5 to the
+// centre sample buffer[5]. "sym" kernels add the mirrored samples; "antisym"
+// kernels subtract them and their centre contribution is forced to 0.
+// NORMY / NORMX are the divisors applied to the accumulated products.
+macro expr NORMY=  4096/2; //exp2(KERN_BITS); // one bit more precision           
+macro expr NORMX=  4096/2;   
+
+macro expr Gab1={    61,   150,   304,   497,   668,   736};   // sym; original note: "changed 737 for the norm" (full 11-tap sum is 4096)
+macro expr Gab2={    -6,   147,    -6,  -502,    -3,   740};   // sym; original note: "changed -503 for the DC component" (full 11-tap sum is 0)
+macro expr Gab3={   -64,     0,   304,     0,  -671,     0};   // antisym
+macro expr Gab4={    32,   -55,  -309,  -315,   283,   728};   // sym (full 11-tap sum is 0)
+macro expr Gab5={    40,   147,    58,  -396,  -598,     0};   // antisym
+macro expr Gab6={    32,   130,  -110,  -491,    72,   734};   // sym (full 11-tap sum is 0)
+macro expr Gab7={   -52,    72,   286,  -119,  -665,     0};   // antisym
+macro expr Gab8={   -64,  -113,   -72,   179,   549,   737};   // sym; NOTE(review): full 11-tap sum is 1695 (non-zero DC), matching the ratio of the 14-bit table - confirm intended
+macro expr Gab9={    -6,  -101,  -295,  -465,  -382,     0};   // antisym
+
+
+
+//                  Recursive vector addition with balanced tree (signed)
+// ************************************************************************************
+// Expands to the sum of Array[begin] .. Array[Index] (both inclusive).
+// The index range is halved recursively at compile time, so the additions
+// form a balanced tree instead of a linear chain. Every leaf element is
+// passed through adjs(..., Extend) before it is added.
+//   Array  : array to be summed
+//   begin  : lowest index included
+//   Index  : highest index included
+//   Extend : width handed to adjs for every leaf
+macro expr SumMacro(Array, begin, Index,Extend) =
+    let macro expr SignedTreeSum(Vec, Hi, Lo, LeafWidth) =
+        let macro expr Split = Lo + (Hi-Lo)/2; in
+            select (Hi == Lo,
+                    adjs(Vec[Hi],LeafWidth),        // single element left: resize it
+                    SignedTreeSum(Vec, Hi, Split + 1,LeafWidth) + SignedTreeSum(Vec, Split, Lo,LeafWidth));
+        in
+            SignedTreeSum(Array, Index, begin,Extend);
+
+//                  Recursive vector addition with balanced tree (unsigned)
+//************************************************************************************
+// Expands to the sum of Array[begin] .. Array[Index] (both inclusive).
+// Same balanced-tree recursion as the signed variant, except that every
+// leaf element is passed through adju(..., Extend) before it is added.
+//   Array  : array to be summed
+//   begin  : lowest index included
+//   Index  : highest index included
+//   Extend : width handed to adju for every leaf
+macro expr UnSumMacro2(Array, begin, Index,Extend) =
+    let macro expr UnsignedTreeSum(Vec, Hi, Lo, LeafWidth) =
+        let macro expr Split = Lo + (Hi-Lo)/2; in
+            select (Hi == Lo,
+                    adju(Vec[Hi],LeafWidth),        // single element left: resize it
+                    UnsignedTreeSum(Vec, Hi, Split + 1,LeafWidth) + UnsignedTreeSum(Vec, Split, Lo,LeafWidth));
+        in
+            UnsignedTreeSum(Array, Index, begin,Extend);
+
+//                  Generic convolution kernel multiplication
+// *******************************************************************************
+// Applies an 11-tap kernel, described by 6 coefficients, to the samples held
+// in buffer and delivers the normalised result on Out.
+//   buffer   : sample window; elements 0..10 are read
+//   Out      : destination of the result; width(Out) fixes the result width
+//   mask     : the 6 kernel coefficients (element 5 is the centre tap)
+//   norm     : divisor applied to the accumulated sum
+//   Symmetry : 1 selects the symmetric form (mirrored samples are added);
+//              any other value selects the antisymmetric form
+//              (buffer[10-i] - buffer[i], centre contribution forced to 0)
+// xilinxmult is defined outside this file; from its use here it is assumed
+// to place the product of its 2nd and 3rd arguments in its 1st argument.
+// TODO confirm its latency and exact semantics.
+macro proc GenKernel_Gabor(buffer,Out,mask,norm, Symmetry) 
+{       
+macro expr Retiming=7;  // length of the aux[] register chain in front of Out (original note: "8 no / 9 no")
+macro expr PipeLatency=3+Retiming-1;  // not referenced inside this macro
+macro expr DataWidth=(width(buffer[0])+KERN_BITS);  // Norm needs KERN_BITS + 1 (sign) + 8 (from 256 gray levels)
+
+const signed KERN_BITS kernel[6]=mask;  // local constant copy of the coefficients
+signed DataWidth Register[6];           // one product per coefficient
+
+signed (DataWidth) aux0;	            // sum of the six products
+signed (width(Out)) aux[Retiming];      // delay chain between the normalised sum and Out
+
+    par
+    {
+		// Compile-time selection of the symmetric or antisymmetric datapath.
+		ifselect (Symmetry==1)
+		{
+			par
+			{
+		        // Mirrored samples are widened by one bit, added, then multiplied by the shared coefficient.
+		        xilinxmult(Register[0], (adjs(buffer[0],(width(buffer[0])+1)) + adjs(buffer[10],(width(buffer[0])+1))), kernel[0]);
+		        xilinxmult(Register[1], (adjs(buffer[1],(width(buffer[0])+1)) + adjs(buffer[9],(width(buffer[0])+1))), kernel[1]);
+		        xilinxmult(Register[2], (adjs(buffer[2],(width(buffer[0])+1)) + adjs(buffer[8],(width(buffer[0])+1))), kernel[2]);
+		        xilinxmult(Register[3], (adjs(buffer[3],(width(buffer[0])+1)) + adjs(buffer[7],(width(buffer[0])+1))), kernel[3]);
+                xilinxmult(Register[4], (adjs(buffer[4],(width(buffer[0])+1)) + adjs(buffer[6],(width(buffer[0])+1))), kernel[4]);
+		        // Centre sample has no mirror partner.
+		        xilinxmult(Register[5], (adjs(buffer[5],(width(buffer[0])+1)) + 0									), kernel[5]);
+			}		
+		}
+		else	// antisymmetric kernel
+		{		
+			par
+			{   
+		        // Mirrored samples are widened by one bit and subtracted (far side minus near side).
+		        xilinxmult(Register[0], (adjs(buffer[10],(width(buffer[0])+1)) - adjs(buffer[0],(width(buffer[0])+1))), kernel[0]);
+		        xilinxmult(Register[1], (adjs(buffer[9],(width(buffer[0])+1)) - adjs(buffer[1],(width(buffer[0])+1))), kernel[1]);
+		        xilinxmult(Register[2], (adjs(buffer[8],(width(buffer[0])+1)) - adjs(buffer[2],(width(buffer[0])+1))), kernel[2]);
+                xilinxmult(Register[3], (adjs(buffer[7],(width(buffer[0])+1)) - adjs(buffer[3],(width(buffer[0])+1))), kernel[3]);
+                xilinxmult(Register[4], (adjs(buffer[6],(width(buffer[0])+1)) - adjs(buffer[4],(width(buffer[0])+1))), kernel[4]);
+		        // The centre tap does not contribute in the antisymmetric form.
+		        Register[5]= 0;
+			}
+		} // end symmetry 
+		        
+        // Earlier linear-chain version of the sum, kept for reference:
+        //aux0= adjs(Register[0],width(aux0)) + adjs(Register[1],width(aux0)) + adjs(Register[2],width(aux0))+ adjs(Register[3],width(aux0))+ adjs(Register[4],width(aux0))+ adjs(Register[5],width(aux0));
+        // Balanced-tree sum of Register[0..5], each resized to width(aux0).
+        aux0= SumMacro(Register, 0, 5, width(aux0));
+
+        // CAREFUL, NORM/2 SHOULD NEVER OVERFLOW DATA!!!
+        // Rounding variant (adds or subtracts norm/2 before dividing), currently disabled:
+      /*  if (aux0>=0)
+            aux[0]= ((aux0+norm/2)/norm)<-(width(Out));  // 
+        else
+            aux[0]= ((aux0-norm/2)/norm)<-(width(Out));  // */
+        
+        // Active variant: plain division by norm with no rounding offset, then
+        // the low width(Out) bits are kept.
+        aux[0]= ((aux0)/norm)<-(width(Out));  // */
+		//aux[0]= (aux0>>11)<-(width(Out));
+      
+        // Shift the result through the remaining aux[] registers.
+        par(i=1;i<(Retiming);i++)
+        {
+            aux[i]=aux[i-1];
+        }
+        // Out is fed from the last register of the chain.
+        Out= aux[Retiming-1];
+		//Out=adjs((adjs(buffer[2],11)*kernel[0]),width(Out));
+
+    } // End main par
+}
+
+
+// ************************************************************************************
+//                  GENERIC X-Y SEPARABLE CONVOLUTION --> TESTED
+// ************************************************************************************
+// Filters Input with X_FIR along a sample delay line, stores the X results in
+// a bank of block RAMs addressed by a column counter, and filters the values
+// read back from that bank with Y_FIR to produce Output.
+//   Input        : incoming sample; its width is used for all internal arrays
+//   Output       : destination of the Y-stage result
+//   X_FIR, Y_FIR : coefficient sets handed to GenKernel_Gabor
+//   NTaps        : length of the X and Y sample windows
+//   NTapsMinus1  : number of block RAMs; also the window slot that receives the
+//                  newest X result. NOTE(review): passed separately from NTaps -
+//                  callers presumably keep it equal to NTaps-1; confirm.
+//   ColumnLength : col wraps to 0 once it has reached ColumnLength-1
+//   normx, normy : divisors for the X and Y stages
+//   Sx, Sy       : symmetry selectors for the X and Y stages
+macro proc GenericConvolution(Input, Output, X_FIR, Y_FIR, NTaps, NTapsMinus1, ColumnLength,normx, normy, Sx,Sy)
+{  
+    macro expr PipeLatency= 2 + 1 +1 + 4*2;  // 2 from main, 1 input, 1 output, 4*2 kernels (not referenced inside this macro)
+    macro expr Retiming=1;      // Retiming value = Retiming-1 (only used to size the unused aux[] below)
+
+    // Declare MPRAM and access macros	
+	static mpram 
+	{
+    	rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];     	//  Read port
+     	wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];	 	//  Write port
+	} ColumnsBuffer[NTapsMinus1] with {block = "BlockRAM"}; 
+
+	// Read access: entry 'col' of RAM 'row'.
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+	// Write access: store 'data' in entry 'col' of RAM 'row'.
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+    
+    // col is the read address, colbis the write address.
+    // NOTE(review): neither has an initial value here, unlike in GaborY.
+    unsigned (log2ceil((MAX_RES_X/SCALE))) col, colbis;
+    // aux[] is only referenced by the commented-out code at the end of this macro.
+    signed (width(Input)) DataArrayX[NTaps], DataArrayY[NTaps], aux[Retiming]  ;
+
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {
+
+        // Read data into array every cycle           
+      	DataArrayX[NTaps-1]=Input;
+            
+       // Shift X data through array
+       	par (i = 0; i != (NTaps-1); i++) // NOTE WIDTH(i)=LOG2CEIL(NTaps)
+       	{
+            DataArrayX[i] = DataArrayX[i+1];            
+       	}
+                                                    
+       // X stage: its result becomes the newest element of the Y window.
+       GenKernel_Gabor(DataArrayX,DataArrayY[NTapsMinus1],X_FIR,normx,Sx);	   
+                        	    			       
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        // Operations by columns                
+        // Both assignments sit in the same par, so colbis receives the value
+        // col held before this update.
+        col= col>=(ColumnLength-1) ? 0 : col+1;
+  		colbis= col;
+					
+  		// Read data into array every cycle 
+   		par(r1=0;r1!=NTapsMinus1;r1++)    
+   		{
+  			// Fill data through array
+       	    DataArrayY[r1] = readRAM(r1<-(log2ceil(NTapsMinus1)),col);		
+   		}	
+
+   		// Shift array and write data into block RAMs every cycle 
+   		// (window element r2+1 is stored in RAM r2).
+   		par(r2=0;r2!=NTapsMinus1;r2++)
+   		{
+            writeRAM(r2<-(log2ceil(NTapsMinus1)),colbis,DataArrayY[r2+1]);
+   		}																							                                
+        
+		// Y stage on the window assembled from the RAM read-back values.
+		GenKernel_Gabor(DataArrayY,Output,Y_FIR,normy,Sy) ;             
+        
+        // Disabled alternative that called Y_FIR directly and retimed its output:
+/*        Y_FIR(DataArrayY,aux[0]) ;             
+        par(i=1;i<(Retiming);i++)
+        {
+            aux[i]=aux[i-1];
+        }
+        Output= aux[Retiming-1];*/				 		
+
+
+    } // End Global par
+}
+
+
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab1 in its symmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab0[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab1,NORMY, 1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab2 in its symmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab1[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab2,NORMY, 1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab3 in its antisymmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab2[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab3,NORMY, -1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab4 in its symmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab3[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab4,NORMY, 1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab5 in its antisymmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab4[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab5,NORMY, -1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab6 in its symmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab5[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab6,NORMY, 1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab7 in its antisymmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab6[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab7,NORMY, -1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab8 in its symmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab7[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab8,NORMY, 1);
+    
+}
+// Function wrapper (declared as an array of 2 functions) around GenKernel_Gabor
+// using coefficient set Gab9 in its antisymmetric form, normalised by NORMY.
+// The result is written to *Out. 'index' is not referenced in the body.
+// Note: the function suffix is zero-based while the GabN macros are one-based.
+void GenKernel_Gabor_Gab8[2](signed int CONV_BITS *buffer, signed CONV_BITS *Out, unsigned FLOW_INDEX_BITS index)
+{
+    
+    GenKernel_Gabor(buffer,(*Out),Gab9,NORMY, -1);
+    
+}
+
+
+
+//                  Y CONVOLUTION FILTERS
+// ************************************************************************************				
+// Shared Y stage of the Gabor bank. Input is pushed into a bank of block RAMs
+// addressed by a column counter; the values read back from the bank, together
+// with the newest sample, form the window DataArrayY on which all nine
+// kernels Gab1..Gab9 are evaluated in parallel.
+//   Input        : incoming sample; its width is used for the RAMs and the window
+//   FNY          : array receiving the nine results; FNY[k] is produced with Gab(k+1)
+//   NTaps        : window length
+//   NTapsMinus1  : number of block RAMs and the window slot that receives Input.
+//                  NOTE(review): passed separately from NTaps - callers
+//                  presumably keep it equal to NTaps-1; confirm.
+//   ColumnLength : col wraps to 0 once it has reached ColumnLength-1
+// Unused local declarations of the original (PipeLatency, Retiming and the
+// signal GHaux) have been removed; none of them was referenced.
+macro proc GaborY(Input, FNY, NTaps, NTapsMinus1, ColumnLength)
+{  
+    // Declare MPRAM and access macros	
+	static mpram 
+	{
+    	rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];     	//  Read port
+     	wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];	 	//  Write port
+	} ColumnsBuffer[NTapsMinus1] with {block = "BlockRAM"}; 
+
+	// Read access: entry 'col' of RAM 'row'.
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+	// Write access: store 'data' in entry 'col' of RAM 'row'.
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+    
+    // col is the read address, colbis the write address.
+    // NOTE(review): the start-up offsets below come from the original author;
+    // their derivation is not visible in this file.
+    static unsigned (log2ceil((MAX_RES_X/SCALE))) col=((MAX_RES_X/SCALE)+1 -2-1-4), colbis=((MAX_RES_X/SCALE) -2-1-8);
+    signed (width(Input)) DataArrayY[NTaps];
+
+    //          Macro Begin
+    // ----------------------------------------------------
+    
+    par
+    {
+        // Read data into array every cycle           
+      	DataArrayY[NTapsMinus1]=Input;                   
+        
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        // Operations by columns 
+        // Both assignments sit in the same par, so colbis receives the value
+        // col held before this update.
+        par
+        {
+            col= col>=(ColumnLength-1) ? 0 : col+1;
+            colbis= col;
+        }       
+					
+  		// Read data into array every cycle 
+   		par(r1=0;r1!=(NTaps-1);r1++)    // NOTE WIDTH(i)=LOG2CEIL(NTaps)
+   		{
+  			// Fill data through array
+       	    DataArrayY[r1] = readRAM(r1<-(log2ceil(NTapsMinus1)),col);							
+   		}	
+
+   		// Shift array and write data into block RAMs every cycle 
+   		// (window element r2+1 is stored in RAM r2).
+   		par(r2=0;r2!=(NTaps-1);r2++)
+   		{
+            writeRAM(r2<-(log2ceil(NTapsMinus1)),colbis,DataArrayY[r2+1]);
+   		}	
+		
+        // Nine kernels on the same window: 1 = symmetric form, -1 = antisymmetric form.
+        GenKernel_Gabor(DataArrayY,(FNY[0]),Gab1,NORMY, 1);
+        GenKernel_Gabor(DataArrayY,(FNY[1]),Gab2,NORMY, 1);
+        GenKernel_Gabor(DataArrayY,(FNY[2]),Gab3,NORMY, -1);
+        GenKernel_Gabor(DataArrayY,(FNY[3]),Gab4,NORMY, 1);
+        GenKernel_Gabor(DataArrayY,(FNY[4]),Gab5,NORMY, -1);
+        GenKernel_Gabor(DataArrayY,(FNY[5]),Gab6,NORMY, 1);
+        GenKernel_Gabor(DataArrayY,(FNY[6]),Gab7,NORMY, -1);
+        GenKernel_Gabor(DataArrayY,(FNY[7]),Gab8,NORMY, 1);
+        GenKernel_Gabor(DataArrayY,(FNY[8]),Gab9,NORMY, -1);
+     } // End Global par
+}
+
+//                  GABOR FILTERS BASE SET
+// ************************************************************************************
+//                  NO SHARING
+// ************************************************************************************
+
+// X-direction FIR stage with its own sample window (no sharing).
+//   Input  : incoming sample; its width is used for the window
+//   Output : destination of the filtered value
+//   X_FIR  : coefficient set handed to GenKernel_Gabor
+//   NTaps  : window length
+//   norm   : divisor handed to GenKernel_Gabor
+//   sym    : symmetry selector handed to GenKernel_Gabor
+macro proc GenericConvolutionX(Input, Output, X_FIR, NTaps,norm, sym)
+{
+    macro expr NewestTap = NTaps-1;     // window slot that receives Input
+
+    signed (width(Input)) DataArrayX[NTaps];
+
+    // All three actions are branches of one par block.
+    par
+    {
+        // Evaluate the kernel on the window.
+        GenKernel_Gabor(DataArrayX,Output,X_FIR,norm,sym) ;
+
+        // Move every stored sample one slot towards index 0.
+        par (tap = 0; tap != NewestTap; tap++)
+        {
+            DataArrayX[tap] = DataArrayX[tap+1];
+        }
+
+        // Store the incoming sample in the last slot.
+        DataArrayX[NewestTap]=Input;
+    }
+}
+
+// Complete separable Gabor base without resource sharing: one GaborY stage
+// produces the nine Y responses FNY[0..8] (FNY[k] uses Gab(k+1)), and sixteen
+// GenericConvolutionX stages turn them into FNYNX[0..15].
+//   DataIn  : input sample stream
+//   FNYNX   : array receiving the 16 separable responses
+//   Columns : column length handed to GaborY
+// In the per-call names below, FaYFbX (or FaYbX) denotes Gab<a> along Y
+// followed by Gab<b> along X.
+macro proc GaborBase(DataIn, FNYNX,Columns)
+{
+    macro expr NTAPS=11;
+    signed CONV_BITS FNY[9];
+
+    par // Original author's reminder: remove _INDEX
+    {
+        // Y stage shared by all X stages.
+        GaborY(DataIn, FNY, NTAPS, (NTAPS-1), Columns);
+
+        // X stages fed by FNY[0] (Gab1 along Y).
+        GenericConvolutionX(FNY[0], FNYNX[0] , Gab2, NTAPS, (NORMX), 1);     // F1Y2X
+        GenericConvolutionX(FNY[0], FNYNX[1] , Gab3, NTAPS, (NORMX), -1);    // F1Y3X
+
+        // X stages fed by FNY[1] and FNY[2] (Gab2 / Gab3 along Y).
+        GenericConvolutionX(FNY[1], FNYNX[2] , Gab1, NTAPS, (NORMX), 1);     // F2Y1X
+        GenericConvolutionX(FNY[2], FNYNX[3] , Gab1, NTAPS, (NORMX), 1);     // F3Y1X
+
+        // X stages fed by FNY[3] (Gab4 along Y).
+        GenericConvolutionX(FNY[3], FNYNX[4] , Gab4, NTAPS, (NORMX), 1);     // F4YF4X
+        GenericConvolutionX(FNY[3], FNYNX[7] , Gab5, NTAPS, (NORMX), -1);    // F4YF5X
+
+        // X stages fed by FNY[4] (Gab5 along Y).
+        GenericConvolutionX(FNY[4], FNYNX[5] , Gab5, NTAPS, (NORMX), -1);    // F5YF5X
+        GenericConvolutionX(FNY[4], FNYNX[6] , Gab4, NTAPS, (NORMX), 1);     // F5YF4X
+
+        // X stages fed by FNY[5] (Gab6 along Y).
+        GenericConvolutionX(FNY[5], FNYNX[12], Gab8, NTAPS, (NORMX), 1);     // F6YF8X
+        GenericConvolutionX(FNY[5], FNYNX[15], Gab9, NTAPS, (NORMX), -1);    // F6YF9X
+
+        // X stages fed by FNY[6] (Gab7 along Y).
+        GenericConvolutionX(FNY[6], FNYNX[13], Gab9, NTAPS, (NORMX), -1);    // F7YF9X
+        GenericConvolutionX(FNY[6], FNYNX[14], Gab8, NTAPS, (NORMX), 1);     // F7YF8X
+
+        // X stages fed by FNY[7] (Gab8 along Y).
+        GenericConvolutionX(FNY[7], FNYNX[8] , Gab6, NTAPS, (NORMX), 1);     // F8YF6X
+        GenericConvolutionX(FNY[7], FNYNX[11], Gab7, NTAPS, (NORMX), -1);    // F8YF7X
+
+        // X stages fed by FNY[8] (Gab9 along Y).
+        GenericConvolutionX(FNY[8], FNYNX[9] , Gab7, NTAPS, (NORMX), -1);    // F9YF7X
+        GenericConvolutionX(FNY[8], FNYNX[10], Gab6, NTAPS, (NORMX), 1);     // F9YF6X
+    }
+}
+
+
+//                  SHARING
+// ************************************************************************************
+
+// X-direction FIR stage for shared use: it keeps one sample window per
+// sharer and works on the window selected by 'index'.
+//   Input           : incoming sample; its width is used for the windows
+//   Output          : destination of the filtered value
+//   X_FIR           : coefficient set handed to GenKernel_Gabor
+//   NTaps           : window length
+//   norm            : divisor handed to GenKernel_Gabor
+//   sym             : symmetry selector handed to GenKernel_Gabor
+//   sharerProcesses : number of windows held in the static array
+//   index           : selects which window is shifted and filtered
+macro proc GenericConvolutionX_index(Input, Output, X_FIR, NTaps,norm, sym, sharerProcesses, index)
+{
+    macro expr NewestTap = NTaps-1;     // window slot that receives Input
+
+    static signed (width(Input)) DataArrayX[sharerProcesses][NTaps];
+
+    // All three actions are branches of one par block.
+    par
+    {
+        // Evaluate the kernel on the selected window.
+        GenKernel_Gabor(DataArrayX[index],Output,X_FIR,norm,sym);
+
+        // Move every sample of the selected window one slot towards index 0.
+        par (tap = 0; tap != NewestTap; tap++)
+        {
+            DataArrayX[index][tap] = DataArrayX[index][tap+1];
+        }
+
+        // Store the incoming sample in the last slot of the selected window.
+        DataArrayX[index][NewestTap]=Input;
+    }
+}
+
+// Shared X stage for the stereo (disparity) path: computes the 16 separable
+// responses FNYNX[0..15] from the nine Y responses FNY[0..8], using the
+// sample windows selected by 'index' (MAX_PROC_DISPARITY windows per stage).
+// In the per-call names below, FaYFbX (or FaYbX) denotes Gab<a> along Y
+// followed by Gab<b> along X; FNY[k] is the Y response for Gab(k+1).
+void functionGenericConvolutionX_index_stereo(signed CONV_BITS *FNY, signed CONV_BITS *FNYNX, unsigned DISPARITY_INDEX_BITS index)
+{
+    macro expr NTAPS=11;
+
+    par
+    {
+        // Fed by FNY[0].
+        GenericConvolutionX_index(FNY[0], FNYNX[0] , Gab2, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F1Y2X
+        GenericConvolutionX_index(FNY[0], FNYNX[1] , Gab3, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F1Y3X
+
+        // Fed by FNY[1] and FNY[2].
+        GenericConvolutionX_index(FNY[1], FNYNX[2] , Gab1, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F2Y1X
+        GenericConvolutionX_index(FNY[2], FNYNX[3] , Gab1, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F3Y1X
+
+        // Fed by FNY[3].
+        GenericConvolutionX_index(FNY[3], FNYNX[4] , Gab4, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F4YF4X
+        GenericConvolutionX_index(FNY[3], FNYNX[7] , Gab5, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F4YF5X
+
+        // Fed by FNY[4].
+        GenericConvolutionX_index(FNY[4], FNYNX[5] , Gab5, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F5YF5X
+        GenericConvolutionX_index(FNY[4], FNYNX[6] , Gab4, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F5YF4X
+
+        // Fed by FNY[5].
+        GenericConvolutionX_index(FNY[5], FNYNX[12], Gab8, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F6YF8X
+        GenericConvolutionX_index(FNY[5], FNYNX[15], Gab9, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F6YF9X
+
+        // Fed by FNY[6].
+        GenericConvolutionX_index(FNY[6], FNYNX[13], Gab9, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F7YF9X
+        GenericConvolutionX_index(FNY[6], FNYNX[14], Gab8, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F7YF8X
+
+        // Fed by FNY[7].
+        GenericConvolutionX_index(FNY[7], FNYNX[8] , Gab6, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F8YF6X
+        GenericConvolutionX_index(FNY[7], FNYNX[11], Gab7, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F8YF7X
+
+        // Fed by FNY[8].
+        GenericConvolutionX_index(FNY[8], FNYNX[9] , Gab7, NTAPS, (NORMX), -1, MAX_PROC_DISPARITY, index);    // F9YF7X
+        GenericConvolutionX_index(FNY[8], FNYNX[10], Gab6, NTAPS, (NORMX), 1, MAX_PROC_DISPARITY, index);     // F9YF6X
+    }
+}
+
+// Shared X stage for the optical-flow path: computes the 16 separable
+// responses FNYNX[0..15] from the nine Y responses FNY[0..8], using the
+// sample windows selected by 'index' (MAX_PROC_FLOW windows per stage).
+// In the per-call names below, FaYFbX (or FaYbX) denotes Gab<a> along Y
+// followed by Gab<b> along X; FNY[k] is the Y response for Gab(k+1).
+void functionGenericConvolutionX_index_flow(signed CONV_BITS *FNY, signed CONV_BITS *FNYNX, unsigned FLOW_INDEX_BITS index)
+{
+    macro expr NTAPS=11;
+
+    par
+    {
+        // Fed by FNY[0].
+        GenericConvolutionX_index(FNY[0], FNYNX[0] , Gab2, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F1Y2X
+        GenericConvolutionX_index(FNY[0], FNYNX[1] , Gab3, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F1Y3X
+
+        // Fed by FNY[1] and FNY[2].
+        GenericConvolutionX_index(FNY[1], FNYNX[2] , Gab1, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F2Y1X
+        GenericConvolutionX_index(FNY[2], FNYNX[3] , Gab1, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F3Y1X
+
+        // Fed by FNY[3].
+        GenericConvolutionX_index(FNY[3], FNYNX[4] , Gab4, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F4YF4X
+        GenericConvolutionX_index(FNY[3], FNYNX[7] , Gab5, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F4YF5X
+
+        // Fed by FNY[4].
+        GenericConvolutionX_index(FNY[4], FNYNX[5] , Gab5, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F5YF5X
+        GenericConvolutionX_index(FNY[4], FNYNX[6] , Gab4, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F5YF4X
+
+        // Fed by FNY[5].
+        GenericConvolutionX_index(FNY[5], FNYNX[12], Gab8, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F6YF8X
+        GenericConvolutionX_index(FNY[5], FNYNX[15], Gab9, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F6YF9X
+
+        // Fed by FNY[6].
+        GenericConvolutionX_index(FNY[6], FNYNX[13], Gab9, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F7YF9X
+        GenericConvolutionX_index(FNY[6], FNYNX[14], Gab8, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F7YF8X
+
+        // Fed by FNY[7].
+        GenericConvolutionX_index(FNY[7], FNYNX[8] , Gab6, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F8YF6X
+        GenericConvolutionX_index(FNY[7], FNYNX[11], Gab7, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F8YF7X
+
+        // Fed by FNY[8].
+        GenericConvolutionX_index(FNY[8], FNYNX[9] , Gab7, NTAPS, (NORMX), -1, MAX_PROC_FLOW, index);    // F9YF7X
+        GenericConvolutionX_index(FNY[8], FNYNX[10], Gab6, NTAPS, (NORMX), 1, MAX_PROC_FLOW, index);     // F9YF6X
+    }
+}
+
+//                  BUILDING GABOR FILTERS
+// ************************************************************************************
+// Combines the 16 separable responses FNYNX[0..15] into the even (fe) and odd
+// (fo) responses of 8 filters.
+//
+// Content of FNYNX, in the naming used by the original author:
+//    [0] F1Y2X     [1] F1Y3X     [2] F2Y1X     [3] F3Y1X
+//    [4] F4YF4X    [5] F5YF5X    [6] F5YF4X    [7] F4YF5X
+//    [8] F8YF6X    [9] F9YF7X   [10] F9YF6X   [11] F8YF7X
+//   [12] F6YF8X   [13] F7YF9X   [14] F7YF8X   [15] F6YF9X
+//
+// Resulting outputs (numbers are FNYNX indices):
+//   k = 0 : fe = 0          fo = 1
+//   k = 1 : fe = 8 - 9      fo = 10 + 11
+//   k = 2 : fe = 4 - 5      fo = 6 + 7
+//   k = 3 : fe = 12 - 13    fo = 14 + 15     (odd = F7YF8X + F6YF9X)
+//   k = 4 : fe = 2          fo = 3
+//   k = 5 : fe = 12 + 13    fo = 14 - 15
+//   k = 6 : fe = 4 + 5      fo = 6 - 7
+//   k = 7 : fe = 8 + 9      fo = 10 - 11
+macro proc BuildGabor(FNYNX,fe,fo)
+{
+    // Every assignment is an independent branch of one par block.
+    par
+    {
+        // Outputs 0 and 4: responses passed through unchanged.
+        fe[0]=FNYNX[0];
+        fo[0]=FNYNX[1];
+        fe[4]=FNYNX[2];
+        fo[4]=FNYNX[3];
+
+        // Outputs 2 and 6: difference / sum pairs of FNYNX[4..7].
+        fe[2]=FNYNX[4]  - FNYNX[5];
+        fo[2]=FNYNX[6]  + FNYNX[7];
+        fe[6]=FNYNX[4]  + FNYNX[5];
+        fo[6]=FNYNX[6]  - FNYNX[7];
+
+        // Outputs 1 and 7: difference / sum pairs of FNYNX[8..11].
+        fe[1]=FNYNX[8]  - FNYNX[9];
+        fo[1]=FNYNX[10] + FNYNX[11];
+        fe[7]=FNYNX[8]  + FNYNX[9];
+        fo[7]=FNYNX[10] - FNYNX[11];
+
+        // Outputs 3 and 5: difference / sum pairs of FNYNX[12..15].
+        fe[3]=FNYNX[12] - FNYNX[13];
+        fo[3]=FNYNX[14] + FNYNX[15];
+        fe[5]=FNYNX[12] + FNYNX[13];
+        fo[5]=FNYNX[14] - FNYNX[15];
+    }
+}
+// Function wrapper (declared as an array of 2 functions) around the
+// BuildGabor macro: combines the responses in FNYNX into fe and fo.
+void BuildGabor_function[2](signed CONV_BITS *FNYNX, signed F_BITS *fe, signed F_BITS *fo)
+{
+    BuildGabor(FNYNX,fe,fo);
+}
+
+// 				CORES FOR ATAN FUNCTION COMPUTATION AND SQRT
+// ************************************************************************************		
+// Wrapper around the external arctangent core named by ATAN2CORENAME_P.
+//   y, x     : core inputs; x is negated before it is driven onto x_in
+//   ena      : drives the core's clock enable (ce)
+//   angle    : receives the core phase output, shifted left by one bit and negated
+//   data_rdy : receives the core's rdy flag
+// When DEBUG is defined the core is not instantiated: angle is assigned the
+// concatenation 0@y@x and data_rdy is not assigned.
+macro proc CoreATAN2CORDICPHI(y, x, ena, angle,data_rdy)
+{
+macro expr CoreWidthIn	= ATAN2COREWIDTHIN_P;
+macro expr CoreWidthOut	= ATAN2COREWIDTHOUT_P;    
+macro expr CoreLatency	= ATAN2CORELATENCY_P;   // not referenced inside this macro
+
+signal aux;     // intermediate value between the core output and angle
+    
+#ifdef DEBUG
+    angle=0@y@x;
+#else
+/*
+VHDL declaration of the core, kept for reference:
+component atan2cordic24
+	port (
+	x_in: IN std_logic_VECTOR(23 downto 0);
+	y_in: IN std_logic_VECTOR(23 downto 0);
+	phase_out: OUT std_logic_VECTOR(9 downto 0);
+	rdy: OUT std_logic;
+	clk: IN std_logic;
+	ce: IN std_logic);
+end component;
+*/			
+	// Core instance: x_in is fed with -x, y_in with y, clocked by __clock.
+	interface ATAN2CORENAME_P(signed CoreWidthOut phase_out, unsigned 1 rdy) atan2(signed  CoreWidthIn x_in=-x, 
+							signed CoreWidthIn y_in=y, unsigned 1 clk=__clock, unsigned 1 ce=ena) with {busformat="B<I>"};
+    par
+    {
+        //assert (width(aux)==3, 0, "Width of x is not 3 (it is %d)", width(aux));
+        // Left shift to utilize the unused bit of the 2QN  core output format (range +/- 1, not above)
+    	aux=(-(atan2.phase_out<<1));
+        data_rdy=atan2.rdy;                            
+        angle=aux;
+    }
+#endif
+}
+// Core replication for chipscope debugging
+// Wrapper around the external arctangent core named by ATAN2CORENAME, used
+// for orientation. Negative phase values get a constant offset added, then
+// the value is scaled and truncated to the width of angle.
+//   y, x     : core inputs; y is negated before it is driven onto y_in
+//   ena      : drives the core's clock enable (ce)
+//   angle    : receives the processed phase (see the par block below)
+//   data_rdy : receives the core's rdy flag
+// When DEBUG is defined the core is not instantiated: angle is assigned the
+// concatenation y@x and data_rdy is not assigned.
+macro proc CoreATAN2CORDICORI(y, x, ena, angle,data_rdy)
+{
+macro expr CoreWidthIn	= ATAN2COREWIDTHIN;
+macro expr CoreWidthOut	= ATAN2COREWIDTHOUT;    
+macro expr CoreLatency	= ATAN2CORELATENCY;     // not referenced inside this macro
+signal aux;     // core phase output shifted right by one bit
+    
+#ifdef DEBUG
+    angle=y@x;
+#else
+/*
+VHDL declaration of the core, kept for reference:
+component atan2cordic24
+	port (
+	x_in: IN std_logic_VECTOR(23 downto 0);
+	y_in: IN std_logic_VECTOR(23 downto 0);
+	phase_out: OUT std_logic_VECTOR(9 downto 0);
+	rdy: OUT std_logic;
+	clk: IN std_logic;
+	ce: IN std_logic);
+end component;
+*/			
+	// Core instance: x_in is fed with x, y_in with -y, clocked by __clock.
+	interface ATAN2CORENAME(signed CoreWidthOut phase_out, unsigned 1 rdy) atan2(signed  CoreWidthIn x_in=x, 
+							signed CoreWidthIn y_in=-y, unsigned 1 clk=__clock, unsigned 1 ce=ena) with {busformat="B<I>"};
+    par
+    {
+    	aux=(atan2.phase_out)>>1;
+        data_rdy=atan2.rdy;         
+        // In both branches the value is shifted left by 2 and the
+        // (ATAN2COREWIDTHOUT-PHASE_ORI_BITS) least significant bits are dropped.
+        // NOTE(review): the offset constant is cast to a hard-coded 24-bit width
+        // rather than CoreWidthOut - confirm ATAN2COREWIDTHOUT is 24.
+        if (aux<0) // Left shift 2 bits to utilize the unused bit of the 2QN  core output format + the sign bit (range 0,1, not above, nor negative)
+            angle=((aux + ((signed 24)0x200000))<<2)\\(ATAN2COREWIDTHOUT-PHASE_ORI_BITS); // adding 0.5x2 (2Q9 format) to wrap orientation to [0,pi)
+        else 
+            angle=(aux<<2)\\(ATAN2COREWIDTHOUT-PHASE_ORI_BITS);        
+    }   // Shift left is allowed because, in fact, angle values are positive
+#endif
+
+}
+//           SQRT Core
+//---------------------------------------
+// Wrapper around the external square-root core named by SQRTCORENAME.
+//   input    : value driven onto the core's x_in
+//   ena      : drives the core's clock enable (ce)
+//   output   : receives the SQRTCOREWIDTHOUT least significant bits of x_out
+//   data_rdy : receives the core's rdy flag
+// When DEBUG is defined the core is not instantiated: output is assigned
+// input directly and data_rdy is not assigned.
+macro proc CoreSQRT(input, ena, output, data_rdy)
+{
+macro expr SqrtWidthIn	= SQRTCOREWIDTHIN;
+macro expr SqrtWidthOut	= SQRTCOREWIDTHOUT+1;   // core port is one bit wider than the value kept
+macro expr SqrtLatency	= SQRTCORELATENCY;      // not referenced inside this macro
+
+#ifndef DEBUG
+/*
+    VHDL declaration of the core, kept for reference:
+    component sqrtcordic20
+	port (
+	x_in: IN std_logic_VECTOR(19 downto 0);
+	x_out: OUT std_logic_VECTOR(10 downto 0); --> theoretically 11 bits, but this is a very rare case, 10 bits is enough
+	rdy: OUT std_logic;
+	clk: IN std_logic;
+	ce: IN std_logic);
+end component;     */    
+	interface SQRTCORENAME(unsigned SqrtWidthOut x_out, unsigned 1 rdy) sqrt (unsigned  SqrtWidthIn x_in=input, 
+                            unsigned 1 ce= ena, unsigned 1 clk=__clock) with {busformat="B<I>"};
+    par
+    {
+        // Keep the low SQRTCOREWIDTHOUT bits; the top bit of x_out is discarded.
+        output = sqrt.x_out<-SQRTCOREWIDTHOUT;
+        data_rdy=sqrt.rdy;    
+    }
+#else
+     output=input;     
+#endif		
+
+}
+  /*  extern "C"
+    {
+        
+        int cocosine(int a);
+    } */
+//           COSLUT Core
+//---------------------------------------
+// Wrapper around the external cosine look-up core named by COSLUTCORENAME.
+//   input  : value driven onto the core's THETA port
+//   output : receives the core's COSINE output
+// When DEBUG is defined the core is not instantiated and output is assigned
+// input directly.
+macro proc CoreCosLUT(input, output)
+{
+macro expr cosLUTWidth	= COSLUTCOREWIDTH;  // width of both THETA and COSINE
+
+#ifndef DEBUG
+    /*
+VHDL declaration of the core, kept for reference:
+component wrapped_cosLUT
+	port (
+	THETA: IN std_logic_VECTOR(9 downto 0);
+	COSINE: OUT std_logic_VECTOR(9 downto 0));
+end component;  */
+	// The interface has no clock or enable port.
+	interface COSLUTCORENAME(signed cosLUTWidth  COSINE) cosineLUT (unsigned  cosLUTWidth THETA=input) with {busformat="B<I>"};
+    par
+    {
+        output = cosineLUT.COSINE;
+  
+    }
+#else
+   
+     output=input;//cocosine(adjs(input,32));       
+#endif		
+
+}
+
+//                  COMPUTING PHASE, MAGNITUDE AND ORIENTATION
+// ************************************************************************************		
+macro proc Primitives(fe,fo,Energy, Orientation, TH ,Latencies)
+{
+	// Sine and cosine LUTs for orientation computation
+	// ******************************************************
+	const signed TRIG_BITS sin[NORIENTATIONS]={  0,  91, 128,  91,  0,  -91, -128,  -91};
+
+	const signed TRIG_BITS cos[NORIENTATIONS]={ 128, 91 ,  0  ,-91 ,-128 , -91 ,  0 ,91};
+    
+    // MAL!!! const signed COSLUTCOREWIDTH angleLUT[NORIENTATIONS]={ 0, 50 ,  101, 151, 201, 251, 302, 352};
+    const unsigned COSLUTCOREWIDTH angleLUT[NORIENTATIONS]={0  ,  64  , 128  , 192  , 256  , 320  , 384 ,  448};
+    
+    // Pipeline equalization delays.  
+    macro expr LATENCIESOFFSET = 15; // + 15 for Softwarre adjustment, 0 for final hardware implementation
+    macro expr PIPEenergy   = 17;      
+    macro expr PIPEOri      = 40;   
+    macro expr PIPEPhase    = PIPEOri+40;
+    macro expr EQPIPEenergy = PIPEPhase-PIPEenergy-1+1+LATENCIESOFFSET  ; // -1 from Software adjustment     
+    macro expr EQPIPEOri    = PIPEPhase-PIPEOri-1+1+LATENCIESOFFSET  ; // + 4 from Software adjustment  
+    macro expr EQPIPEphase  = 0+1+LATENCIESOFFSET; // longest stage // + 0 from Software adjustment
+    //macro expr EQPIPEphase  = 15+1+LATENCIESOFFSET; // longest stage // + 0 from Software adjustment
+    
+    macro expr WAIT_FOR_ORI = PIPEOri+2-8 + LATENCIESOFFSET; // + 10 from Software adjustment
+    
+    // Data
+    unsigned phiangle[NORIENTATIONS];
+    signed cosLUTphiangle[NORIENTATIONS];
+                                  	
+	signed (F_BITS*2) EnergyA[NORIENTATIONS], EnergyB[NORIENTATIONS];
+    unsigned (F_BITS*2) EnergyC[NORIENTATIONS];
+    unsigned SQRTCOREWIDTHIN meanEnergy;
+	
+	signed (ORIENTED_ENERGY_BITS+TRIG_BITS-1) OriA[NORIENTATIONS], OriB[NORIENTATIONS];    
+    signed (ACORI_BITS) AcNumOri[2], AcDenOri[2];
+    
+	
+   	signed ACPHI_BITS AcNumPhi[NORIENTATIONS], AcDenPhi[NORIENTATIONS], AcNumPhiBIS, AcDenPhiBIS;	
+    
+    signed fecopy[WAIT_FOR_ORI][NORIENTATIONS], focopy[WAIT_FOR_ORI][NORIENTATIONS];
+
+    //signal <signed (PHASE_ORI_BITS*2)> OriForPhaseSignal;
+    unsigned COSLUTCOREWIDTH OriForPhase;
+
+    
+    // Outputs
+    unsigned E[EQPIPEenergy];
+    signed PHASE_ORI_BITS OriAngle[EQPIPEOri], PhiAngle[NORIENTATIONS];
+    signed PHASE_ORI_BITS Phi[EQPIPEphase];
+
+	
+    // Cores control signals
+    static signal <unsigned 1> en0=0;
+    static signal <unsigned 1> en1=0;
+    static signal <unsigned 1> en2=0;
+    
+    unsigned int 1 rdy0, rdy1, rdy2;
+    
+    /*#ifdef DEBUG
+        int 16 auxOri;
+         chanin <signed 16> chanori with { infile= "C:/RC2000/RC2000Local_features/DKsimulator/matlab/ori.dk" }; 
+    #endif*/
+
+	par
+	{
+        // Enabling using signals the sqrt and atan2 cores for one clock cycle 
+        en0=1;
+        en1=1;
+        en2=1;		
+
+        // *********************************************************************  //
+		//              ENERGY
+        // *********************************************************************  //                    
+		par(o=0;o<NORIENTATIONS;o++)
+        {
+		// Pipe 1, Energy
+            xilinxmult(EnergyA[o], fe[o],fe[o]); 
+            xilinxmult(EnergyB[o], fo[o],fo[o]);
+
+        // Pipe 2, Energy	
+            EnergyC[o]=	(unsigned)EnergyA[o]+(unsigned)EnergyB[o]; 					
+		}
+
+		// Pipe 3, Energy BE CAREFULL WITH OVERFLOW EFFECTS!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+		meanEnergy=(UnSumMacro2(EnergyC,0,(NORIENTATIONS-1),(width(EnergyC[0])+3))\\(3+width(EnergyC[0])-width(meanEnergy)))<-width(meanEnergy);
+
+        
+        // Pipe 4-20, Energy. 
+        CoreSQRT(meanEnergy, en0, E[0], rdy0);  // 17 cycles latency
+        
+        // *********************************************************************  //
+		//              ORIENTATION
+        // *********************************************************************  //
+        // Pipe 3, Orientation WARNNING: EnergyC has two clock cycles delay to Ori
+        par(o=0;o<NORIENTATIONS;o++)	
+		{   //                                  34 to -> 28 bits, fractional part from 14 to 8 
+			xilinxmult(OriA[o], ((signed)adju((EnergyC[o]>>(width(EnergyC[0])-ORIENTED_ENERGY_BITS)),width(EnergyC[0])+2)), sin[o]); 
+			xilinxmult(OriB[o], ((signed)adju((EnergyC[o]>>(width(EnergyC[0])-ORIENTED_ENERGY_BITS)),width(EnergyC[0])+2)), cos[o]);           
+		}	//                                  28+9-1 (sign) = 36 bits for OriAB => +2 required               
+        
+        // Pipe 4, Orientation              acumulation lower than 2 (in fact is 2.8)
+        AcNumOri[0]=SumMacro(OriA,0,(NORIENTATIONS-1),(width(OriA[0])+1))\\(width(OriA[0])+1-width(AcNumOri[0]));
+		AcDenOri[0]=SumMacro(OriB,0,(NORIENTATIONS-1),(width(OriB[0])+1))\\(width(OriA[0])+1-width(AcNumOri[0]));
+		
+      
+        //assert (width(OriA)==3, 0, "Width of Ori is not 3 (it is %d)", width(OriA));
+		// Pipe 5, Orientation
+		AcNumOri[1]=AcNumOri[0]>>0; // core needs inputs in -1<=x<=1 format
+		AcDenOri[1]=AcDenOri[0]>>0; // PERHAPS >>1 is needed in case the whole range be used
+ 
+        //assert (width(OriAngle[1])==3, 0, "Width of x is not 3 (it is %d)", width(OriAngle[1]));
+
+        // Pipe 6-40, Orientation (atan2 core latency =35)
+        CoreATAN2CORDICORI(AcNumOri[1], AcDenOri[1], en1, OriAngle[0], rdy1); 
+        
+        // *********************************************************************  //
+		//              PHASE
+        // *********************************************************************  //				
+		// Pipe synchronization, waiting orientation data and storing filters outputs
+      /*  par(k=0; k<WAIT_FOR_ORI;k++)
+        {   
+            ifselect(k==0)      
+            {
+                par(o=0; o<NORIENTATIONS;o++)
+                {
+                    fecopy[k][o]=fe[o];
+                    focopy[k][o]=fo[o];
+                }
+            }
+            else    
+            {
+                par(o=0; o<NORIENTATIONS;o++)
+                {
+                    fecopy[k][o]=fecopy[k-1][o];
+                    focopy[k][o]=focopy[k-1][o];
+                }
+            }
+        }  
+        
+        
+        //  WAIT_FOR_ORI + 1 cycles   // 2QN x 2QN => 5QN (duplicated sign bit)
+        //xilinxmult(OriForPhaseSignal, (signed PHASE_ORI_BITS)  PI, ((OriAngle[0])>>2)); 
+        // Note that for coherence we go back to the 2QN format for OriAngle[0]
+        
+        // Core input range 0-> 2pi (core input = 1024*angle(rad)/2pi) => because our max input is pi we have to /2
+        OriForPhase=(((unsigned)OriAngle[0])>>1)\\(width(OriAngle[0])-width(OriForPhase));
+                
+        par(o=0;o<NORIENTATIONS;o++)	
+		{   //  WAIT_FOR_ORI + 2 cycles     
+            if (OriForPhase>=angleLUT[o])
+                phiangle[o]=OriForPhase - angleLUT[o];
+            else 
+                phiangle[o]= angleLUT[o]- OriForPhase;
+            
+            //  WAIT_FOR_ORI + 3 cycles  
+            CoreCosLUT(phiangle[o], cosLUTphiangle[o]);
+            
+             //  WAIT_FOR_ORI + 4 cycles  
+			xilinxmult(AcNumPhi[o], focopy[WAIT_FOR_ORI-1-adju(Latencies[15:12],6)][o], cosLUTphiangle[o] ); 
+			xilinxmult(AcDenPhi[o], fecopy[WAIT_FOR_ORI-1-adju(Latencies[15:12],6)][o], abs(cosLUTphiangle[o]) ); 
+		}	//  
+        
+        //  WAIT_FOR_ORI + 5 cycles  
+		AcNumPhiBIS=SumMacro(AcNumPhi,0,(NORIENTATIONS-1),(ACPHI_BITS));
+		AcDenPhiBIS=SumMacro(AcDenPhi,0,(NORIENTATIONS-1),(ACPHI_BITS));        
+
+		  //  WAIT_FOR_ORI + 5-40 cycles  
+        CoreATAN2CORDICPHI(adjs(AcNumPhiBIS,ATAN2COREWIDTHIN), adjs(AcDenPhiBIS,ATAN2COREWIDTHIN), en2, Phi[0], rdy2);
+        
+        AcNumPhiBIS=SumMacro(fo,0,(NORIENTATIONS-1),(ACPHI_BITS));
+		AcDenPhiBIS=SumMacro(fe,0,(NORIENTATIONS-1),(ACPHI_BITS));        
+
+		  //  WAIT_FOR_ORI + 5-40 cycles  
+        CoreATAN2CORDICPHI(AcNumPhiBIS, AcDenPhiBIS, en2, Phi[0], rdy2);  */      
+        
+        // *********************************************************************  //
+		//              Pipe equalization and sending processed data
+        // *********************************************************************  //
+        
+        // Pipe 20-??        
+		par(i=0; i<(EQPIPEenergy-1);i++)
+		{
+			E[i+1]=E[i];
+		}
+
+		par(i=0; i<(EQPIPEOri-1);i++)
+		{
+			OriAngle[i+1]=OriAngle[i];
+		}
+        
+        /*par(i=0; i<(EQPIPEphase-1);i++)
+		{
+			Phi[i+1]=Phi[i];
+		}*/
+
+        Energy=(E[EQPIPEenergy-1-adju(Latencies[11:8],7)]> 0@TH) ? E[EQPIPEenergy-1-adju(Latencies[11:8],7)]\\1 : 0; //SetNAN(E[0]);
+        // Divide by 2 when reading in software because we use the double angle representation
+        Orientation=(E[EQPIPEenergy-1-adju(Latencies[11:8],7)]> 0@TH ) ?  OriAngle[EQPIPEOri-1-adju(Latencies[7:4],6)]\\1 : 0;//SetNAN(Orientation); 		        
+        //Phase=(E[EQPIPEenergy-1-adju(Latencies[11:8],7)]> 0@TH ) ? Phi[EQPIPEphase-1-adju(Latencies[3:0],4)]\\1 : SetNAN(Phase);       		
+	}
+}
+
+
+//                  COMPUTING PHASE, MAGNITUDE AND ORIENTATION
+// ************************************************************************************		
+// Primitives_short: reduced primitives pipeline producing a thresholded energy
+// value and a thresholded per-orientation energy vector.
+//   fe, fo       - arrays indexed [0..NORIENTATIONS-1]; each element is squared
+//                  (presumably the even/odd filter responses -- confirm with caller)
+//   Energy       - output: delayed CoreSQRT result with its LSB dropped, or 0 when
+//                  the delayed value is not above TH
+//   Orientation  - output array [NORIENTATIONS]: delayed per-orientation energy
+//                  (fe^2+fo^2 with LSBs dropped), or 0 when below threshold.
+//                  Note that, despite the name, no angle is computed here.
+//   TH           - threshold, zero-extended ("0@TH") before the comparison
+//   Latencies    - run-time tap selectors: bits [11:8] choose the energy delay tap,
+//                  bits [7:4] choose the orientation delay tap
+macro proc Primitives_short(fe,fo,Energy, Orientation, TH ,Latencies)
+{
+	// Pipeline equalization delays (all compile-time constants).
+    macro expr LATENCIESOFFSET = 15; // + 15 for software adjustment, 0 for final hardware implementation
+    macro expr PIPEenergy   = 17;      
+    //macro expr PIPEOri      = 40;
+    macro expr PIPEOri      = 40-35-3+1;   // evaluates to 3
+    macro expr PIPEPhase    = PIPEOri+40;  // evaluates to 43
+    macro expr EQPIPEenergy = PIPEPhase-PIPEenergy-1+1+LATENCIESOFFSET  ; // evaluates to 41: depth of the E delay line
+    macro expr EQPIPEOri    = PIPEPhase-PIPEOri-1+1+LATENCIESOFFSET  ; // evaluates to 55: depth of the OriAngle delay line
+    macro expr EQPIPEphase  = 0+1+LATENCIESOFFSET; // evaluates to 16; only sizes Phi[] below
+    
+    macro expr WAIT_FOR_ORI = PIPEOri+2-8 + LATENCIESOFFSET; // evaluates to 12; only sizes fecopy/focopy below
+    
+    // Data
+    // NOTE(review): phiangle, cosLUTphiangle, OriA/OriB, AcNumOri/AcDenOri,
+    // AcNumPhi/AcDenPhi(/BIS), fecopy/focopy, OriForPhase and Phi are declared but
+    // never referenced in this macro body -- they look like leftovers from
+    // Primitives; confirm before removing.
+    unsigned phiangle[NORIENTATIONS];
+    signed cosLUTphiangle[NORIENTATIONS];
+                                  	
+	signed (F_BITS*2) EnergyA[NORIENTATIONS], EnergyB[NORIENTATIONS];   // fe^2 and fo^2 per orientation
+    unsigned (F_BITS*2) EnergyC[NORIENTATIONS];                         // fe^2 + fo^2 per orientation
+    unsigned SQRTCOREWIDTHIN meanEnergy;                                // input word of the sqrt core
+	
+	signed (ORIENTED_ENERGY_BITS+TRIG_BITS-1) OriA[NORIENTATIONS], OriB[NORIENTATIONS];    
+    signed (ACORI_BITS) AcNumOri[2], AcDenOri[2];
+    
+	
+   	signed ACPHI_BITS AcNumPhi[NORIENTATIONS], AcDenPhi[NORIENTATIONS], AcNumPhiBIS, AcDenPhiBIS;	
+    
+    signed fecopy[WAIT_FOR_ORI][NORIENTATIONS], focopy[WAIT_FOR_ORI][NORIENTATIONS];
+
+    unsigned COSLUTCOREWIDTH OriForPhase;
+
+    
+    // Outputs
+    // E and OriAngle are shift-register delay lines; element 0 is the newest sample.
+    // NOTE(review): the width of E is the literal 10 here, whereas Primitives leaves
+    // it to be inferred -- presumably it should track SQRTCOREWIDTHOUT; confirm.
+    unsigned 10 E[EQPIPEenergy];
+    unsigned PHASE_ORI_BITS OriAngle[EQPIPEOri][NORIENTATIONS];
+    signed PHASE_ORI_BITS Phi[EQPIPEphase];
+
+	
+    // Cores control signals (signals default to 0 and are driven to 1 below)
+    static signal <unsigned 1> en0=0;
+    static signal <unsigned 1> en1=0;
+    static signal <unsigned 1> en2=0;
+    
+    unsigned int 1 rdy0, rdy1, rdy2;
+
+	par
+	{
+        // Drive the core enables. Only en0 is consumed in this macro (by CoreSQRT);
+        // en1 and en2 are driven but not passed to any core here.
+        en0=1;
+        en1=1;
+        en2=1;		
+
+        // *********************************************************************  //
+		//              ENERGY
+        // *********************************************************************  //                    
+		par(o=0;o<NORIENTATIONS;o++)
+        {
+		// Pipe 1, Energy: square both responses of orientation o
+            xilinxmult(EnergyA[o], fe[o],fe[o]); 
+            xilinxmult(EnergyB[o], fo[o],fo[o]);
+
+        // Pipe 2, Energy: per-orientation energy fe^2 + fo^2
+            EnergyC[o]=	(unsigned)EnergyA[o]+(unsigned)EnergyB[o]; 					
+		}
+
+		// Pipe 3, Energy. BE CAREFUL WITH OVERFLOW EFFECTS!
+		// Sums EnergyC over all orientations at (width+3) bits, drops the low bits so
+		// that width(meanEnergy) bits remain, then takes those bits with "<-".
+		meanEnergy=(UnSumMacro2(EnergyC,0,(NORIENTATIONS-1),(width(EnergyC[0])+3))\\(3+width(EnergyC[0])-width(meanEnergy)))<-width(meanEnergy);
+
+        
+        // Pipe 4-20, Energy: square root of the accumulated energy into E[0].
+        CoreSQRT(meanEnergy, en0, E[0], rdy0);  // 17 cycles latency
+        
+        // *********************************************************************  //
+		//              ORIENTATION
+        // *********************************************************************  //
+        // No orientation angle is estimated in this variant: the head of the
+        // OriAngle delay line simply receives each orientation's energy with its
+        // 4 LSBs dropped, width-adjusted to the OriAngle element width.
+        par(cnt=0; cnt<NORIENTATIONS; cnt++)
+        {
+            OriAngle[0][cnt]= adju(EnergyC[cnt]\\4, width(OriAngle[0][0]));
+        }
+        
+        
+        // *********************************************************************  //
+		//              PHASE
+        // *********************************************************************  //				
+		// (not computed in this variant)
+		
+        // *********************************************************************  //
+		//              Pipe equalization and sending processed data
+        // *********************************************************************  //
+        
+        // Pipe 20-??: advance the energy delay line by one position per cycle
+		par(i=0; i<(EQPIPEenergy-1);i++)
+		{
+			E[i+1]=E[i];
+		}
+
+		// Advance the per-orientation delay line by one position per cycle
+		par(i=0; i<(EQPIPEOri-1);i++)
+		{
+			
+            par(cnt2=0; cnt2<NORIENTATIONS; cnt2++)
+            {
+                OriAngle[i+1][cnt2]=OriAngle[i][cnt2];
+            }
+        
+		}
+        
+        // Output tap = last element minus the 4-bit run-time offset Latencies[11:8];
+        // the value is passed (LSB dropped) only when it exceeds the threshold.
+        Energy=(E[EQPIPEenergy-1-adju(Latencies[11:8],6)]> 0@TH) ? E[EQPIPEenergy-1-adju(Latencies[11:8],6)]\\1 : 0; //SetNAN(E[0]);
+        // Divide by 2 when reading in software because we use the double angle representation
+        
+        // Each orientation output is gated by the same energy-vs-threshold test and
+        // read from the tap selected by Latencies[7:4].
+        par(cnt3=0; cnt3<NORIENTATIONS; cnt3++)
+        {
+            Orientation[cnt3]=(E[EQPIPEenergy-1-adju(Latencies[11:8],6)]> 0@TH ) ?  OriAngle[EQPIPEOri-1-adju(Latencies[7:4],6)][cnt3]\\1 : 0;//SetNAN(Orientation);
+        }
+	}
+}
+
+// PhasePrimitive: phase estimate from the orientation-summed filter responses.
+//   fe, fo   - arrays indexed [0..NORIENTATIONS-1]; fo is summed into the first
+//              (numerator) core input, fe into the second (denominator) one
+//   phase    - output: core result with its LSB dropped, read from a delay line
+//   latency  - selects delay-line element [latency-1]; the line has 80 elements
+macro proc PhasePrimitive (fe, fo, phase, latency)
+{
+    macro expr DelayDepth = 80;              // number of taps in the output delay line
+
+    signal static unsigned 1 coreEnable=0;   // core enable, driven to 1 below
+    unsigned 1 coreReady;                    // ready flag returned by the core (not used further)
+    signed ACPHI_BITS numSum;                // sum of fo over all orientations
+    signed ACPHI_BITS denSum;                // sum of fe over all orientations
+    signed ACPHI_BITS angle;                 // raw core output
+    signed 9 delayLine[DelayDepth];          // element 0 is the newest sample
+
+    par
+    {
+        // Output tap chosen by the caller
+        phase = delayLine[latency-1];
+
+        // Advance the delay line by one position
+        par(tap=1;tap<DelayDepth;tap++)
+        {
+            delayLine[tap]=delayLine[tap-1];
+        }
+
+        // Newest sample: core output with its least significant bit dropped
+        delayLine[0]=angle\\1;
+
+        // atan2 core fed with the two accumulated sums
+        CoreATAN2CORDICPHI(numSum, denSum, coreEnable, angle, coreReady);
+
+        // Accumulate the responses over all orientations
+        denSum=SumMacro(fe,0,(NORIENTATIONS-1),(ACPHI_BITS));
+        numSum=SumMacro(fo,0,(NORIENTATIONS-1),(ACPHI_BITS));
+
+        coreEnable=1;
+    }
+}
+// *******************************************************************************
+//          AUXILIARY MACROS (ONLY BETA VERSIONS, UNDER TEST)
+// *******************************************************************************
+// *******************************************************************************
+
+// *******************************************************************************
+
+// *******************************************************************************
+//      Sorting input data
+// *******************************************************************************
+//          Input data must be signed
+// Sort: rank-based parallel sort of bufferIn into bufferOut (ascending).
+//   bufferIn     - array of bufferLength signed values
+//   bufferOut    - array receiving the values at their rank position
+//   bufferLength - number of elements (compile-time constant)
+// Every element is compared against every other element; the number of
+// comparisons it wins is its rank, i.e. its index in bufferOut. Ties are made
+// unique by comparing with ">" against lower-indexed elements and with ">="
+// against higher-indexed ones.
+macro proc Sort(bufferIn,bufferOut, bufferLength) 
+{       
+macro expr Retiming=1; // Retiming value = Retiming-1
+macro expr PipeLatency=3;   // not referenced inside this macro
+macro expr DataWidth=(width(bufferIn[0]));
+// Previous, linear-recursion version of SumMacro:
+//macro expr SumMacro(vector,begin,end,Extend)= select(end==begin, adju(vector[begin],Extend), 
+//                   adju(vector[end],Extend)+SumMacro(vector,begin,end-1,Extend));
+// Local SumMacro: adds Array[begin..Index], each element widened to Extend bits,
+// by recursively splitting the index range in halves.
+macro expr SumMacro(Array, begin, Index,Extend) =
+    let macro expr RecurseAddAux(Array, Top, Bottom) = 
+        let macro expr Middle = Bottom + (Top-Bottom)/2; in
+            select (Top == Bottom,adju(Array[Top],Extend),
+                RecurseAddAux(Array, Top, Middle + 1) + RecurseAddAux(Array, Middle, Bottom));
+        in
+            RecurseAddAux(Array, Index, begin);
+
+signed DataWidth bufferInternal[Retiming+1][bufferLength];            // delayed copies of the input values
+unsigned 1 sum[bufferLength][(bufferLength-0)];             //  Only bufferLength-1 columns are used, but the compiler fails with -1;
+unsigned (log2ceil(bufferLength)) position[Retiming][bufferLength];   //  the unused column is left for the synthesizer to optimize away
+    
+    // NOTE(review): everything after the comparison below does not depend on i2
+    // but sits inside the i2 replicator, so it is instantiated bufferLength-1
+    // times with identical content. Presumably it was meant to live in the i1
+    // loop only -- confirm the intended timing before moving it.
+    par(i1=0;i1<bufferLength;i1++)
+    {
+        par(i2=0;i2<(bufferLength-1);i2++)
+        {
+            //assert ((MAX_RES_X/SCALE)  == 24, 0, "Application requires %d",width(i2));
+            // Comparisons: column i2 of row i1 holds the result of comparing
+            // element i1 with element i2 (if i2 < i1) or element i2+1 (otherwise),
+            // so an element is never compared with itself.
+            ifselect(i1>(0@i2))    // left side
+            {
+                if(bufferIn[i1]>bufferIn[0@i2])
+                    sum[i1][i2]=1;
+                else
+                    sum[i1][i2]=0;
+            }
+            else  //ifselect(i1<=i2)  // i1<=i2, right side, except center pixel
+            {
+                if(bufferIn[i1]>=bufferIn[0@i2+1])
+                    sum[i1][i2]=1;
+                else
+                    sum[i1][i2]=0;            
+            }
+            // Keep a copy of the value while its rank is being computed
+            bufferInternal[0][i1]=bufferIn[i1];
+            
+            // Positions estimation: rank = number of comparisons won
+            position[0][i1]=SumMacro(sum[i1],0,(bufferLength-2),width(position[0]));            
+            bufferInternal[1][i1]=bufferInternal[0][i1];
+            
+            // Retiming (disabled)
+            /*par(t=1;t<Retiming;t++)
+            {
+                position[t][i1]=position[t-1][i1];
+                bufferInternal[t+1][i1]=bufferInternal[t][i1];
+            }*/
+                        
+            // Sorting vector: store the delayed value at its rank
+            bufferOut[position[Retiming-1][i1]]=bufferInternal[Retiming][i1];             
+        }
+    }
+}
+
+
+
+// *******************************************************************************
+//      Sorting input data with invalid values
+// *******************************************************************************
+//          Input data must be signed
+// SortNaN: same rank-based parallel sort as Sort, plus a count of invalid values.
+//   bufferIn     - array of bufferLength signed values
+//   bufferOut    - array receiving the values at their rank position (ascending)
+//   bufferLength - number of elements (compile-time constant)
+//   offset       - output: number of elements equal to the bit pattern
+//                  0b100000000000 (presumably the NaN marker -- confirm)
+macro proc SortNaN(bufferIn,bufferOut, bufferLength, offset) 
+{       
+macro expr Retiming=1; // Retiming value = Retiming-1
+macro expr PipeLatency=3;   // not referenced inside this macro
+macro expr DataWidth=(width(bufferIn[0]));
+// Previous, linear-recursion version of SumMacro:
+//macro expr SumMacro(vector,begin,end,Extend)= select(end==begin, adju(vector[begin],Extend), 
+//                   adju(vector[end],Extend)+SumMacro(vector,begin,end-1,Extend));
+// Local SumMacro: adds Array[begin..Index], each element widened to Extend bits,
+// by recursively splitting the index range in halves.
+macro expr SumMacro(Array, begin, Index,Extend) =
+    let macro expr RecurseAddAux(Array, Top, Bottom) = 
+        let macro expr Middle = Bottom + (Top-Bottom)/2; in
+            select (Top == Bottom,adju(Array[Top],Extend),
+                RecurseAddAux(Array, Top, Middle + 1) + RecurseAddAux(Array, Middle, Bottom));
+        in
+            RecurseAddAux(Array, Index, begin);
+
+signed DataWidth bufferInternal[Retiming+1][bufferLength];            // delayed copies of the input values
+unsigned 1 sum[bufferLength][(bufferLength-0)];             //  Only bufferLength-1 columns are used, but the compiler fails with -1;
+unsigned (log2ceil(bufferLength)) position[Retiming][bufferLength];   //  the unused column is left for the synthesizer to optimize away
+unsigned (log2ceil(bufferLength)) NumInvalid;                         // declared but never referenced in this macro
+unsigned 1 SumInvalid[bufferLength];                                  // per-element "is invalid" flag
+    
+    // NOTE(review): everything after the comparison below (including the
+    // assignment to offset, which depends on neither i1 nor i2) sits inside both
+    // replicators and is therefore instantiated many times with identical
+    // content. Presumably unintended -- confirm the intended timing before moving.
+    par(i1=0;i1<bufferLength;i1++)
+    {
+        par(i2=0;i2<(bufferLength-1);i2++)
+        {
+            //assert ((MAX_RES_X/SCALE)  == 24, 0, "Application requires %d",width(i2));
+            // Comparisons: column i2 of row i1 holds the result of comparing
+            // element i1 with element i2 (if i2 < i1) or element i2+1 (otherwise),
+            // so an element is never compared with itself.
+            ifselect(i1>(0@i2))    // left side
+            {
+                if(bufferIn[i1]>bufferIn[0@i2])
+                    sum[i1][i2]=1;
+                else
+                    sum[i1][i2]=0;
+            }
+            else  //ifselect(i1<=i2)  // i1<=i2, right side, except center pixel
+            {
+                if(bufferIn[i1]>=bufferIn[0@i2+1])
+                    sum[i1][i2]=1;
+                else
+                    sum[i1][i2]=0;            
+            }
+            // Keep a copy of the value while its rank is being computed
+            bufferInternal[0][i1]=bufferIn[i1];
+            
+            // Positions estimation: rank = number of comparisons won
+            position[0][i1]=SumMacro(sum[i1],0,(bufferLength-2),width(position[0]));            
+            bufferInternal[1][i1]=bufferInternal[0][i1];
+            
+            // Retiming (disabled)
+            /*par(t=1;t<Retiming;t++)
+            {
+                position[t][i1]=position[t-1][i1];
+                bufferInternal[t+1][i1]=bufferInternal[t][i1];
+            }*/
+                        
+            // Sorting vector: store the delayed value at its rank
+            bufferOut[position[Retiming-1][i1]]=bufferInternal[Retiming][i1];
+
+            // Counting invalid values.
+            // NOTE(review): the marker is a fixed 12-bit literal, so this only
+            // lines up with DataWidth == 12 -- confirm against the callers.
+            if(bufferInternal[0][i1]==0b100000000000)
+                SumInvalid[i1]=1;
+            else
+                SumInvalid[i1]=0;
+            
+            // Total number of flagged elements
+            offset = SumMacro(SumInvalid,0,(bufferLength-1),width(offset));
+
+        }
+    }
+}
+
+// ************************************************************************************
+//                  MEDIAN FITLERING FOR IMAGE SALT & PEPPER DENOISING
+// ************************************************************************************
+// Median: 3x3 median filter on a pixel stream.
+//   Input        - new sample, read once per cycle
+//   Output       - element (NTaps*NTaps)/2 of the sorted 3x3 window, i.e. the median
+//   ColumnLength - address at which the line-buffer pointer wraps back to 0
+// Two block-RAM line buffers (NTaps-1) supply the older rows of the window;
+// the 9 window values are flattened and handed to Sort.
+macro proc Median(Input, Output, ColumnLength)
+{    
+    macro expr PipeLatency=  1 +1 + 3;  // 1 input, 1 output, 3 sorting data (not referenced inside this macro)
+    macro expr NTaps=3;                 // window is NTaps x NTaps
+    macro expr Retiming=1;      // Retiming value = Retiming-1    
+
+    // Declare MPRAM and access macros	
+	static mpram 
+	{
+    	rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];      //  Read port
+     	wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];	 //  Write port
+	} ColumnsBuffer[NTaps-1] with {block = "BlockRAM"}; 
+
+	// readRAM/writeRAM: access port helpers for line buffer 'row' at address 'col'
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+    
+    // col is the read address, colbis the write address. Their start values are
+    // offsets from MAX_RES_X/SCALE (-6 and -11 respectively).
+    // NOTE(review): presumably tuned to align the window with the incoming stream -- confirm.
+    static unsigned (log2ceil((MAX_RES_X/SCALE))) col=((MAX_RES_X/SCALE)+1 -2-1-4), colbis=((MAX_RES_X/SCALE) -2-1-8);
+    signed (width(Input)) DataMatrix[NTaps][NTaps], bufferIn[(NTaps*NTaps)], bufferOut[(NTaps*NTaps)];
+    
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {
+       //assert (1  == 24, 0, "Application requires %d",log2ceil(8));
+
+       	// Updating matrix. Read data into array every cycle:
+       	//   column 0 of row 0   <- Input
+       	//   column 0 of row r>0 <- line buffer r-1 at address col
+       	//   column c>0          <- column c-1 of the same row (shift)
+       	par (r = 0; r != NTaps; r++)
+       	{
+            par (c = 0; c != NTaps; c++)
+            {
+                ifselect(c==0) 
+                {
+                    ifselect(r==0)
+                        DataMatrix[0][0]=Input;  // Read new data             
+                    else
+                    {
+                        DataMatrix[r][c] = readRAM((r-1)<-log2ceil(NTaps-1),col);							
+                    }
+                }
+                else    // shift data through the matrix
+                {
+                    DataMatrix[r][c]=DataMatrix[r][c-1];
+                }
+
+
+            }
+       	}
+   		// Storing previous data: column 0 of row r1 goes into line buffer r1,
+   		// so it is read back as row r1+1
+        par(r1=0;r1!=(NTaps-1);r1++)
+   		{
+            writeRAM(r1,colbis,DataMatrix[0@r1][0]);						        
+   		}																							                                
+        
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        
+        // Operations by columns: advance the read address with wrap-around; the
+        // write address takes the value col held in this cycle.
+        // NOTE(review): the wrap test uses "==" here while
+        // SpatialConvolutions_last uses ">=" -- confirm which is intended.
+        col= col==(ColumnLength-1) ? 0 : col+1;
+  		colbis= col;
+
+        // Sorting data and median filtering: flatten the window row-major
+        par(r2=0;r2!=NTaps;r2++)
+        {
+            par(c2=0;c2!=NTaps;c2++)
+            {
+                bufferIn[adju(r2,log2ceil(NTaps*NTaps))*NTaps+adju(c2,log2ceil(NTaps*NTaps))]=DataMatrix[r2][c2];
+            }
+        }        
+        
+        Sort(bufferIn,bufferOut, (NTaps*NTaps));
+        Output=bufferOut[(NTaps*NTaps)/2];   // middle element of the sorted window
+					
+        // Output retiming (disabled)
+        /*par(i=1;i<(Retiming);i++)
+        {
+            aux[i]=aux[i-1];
+        }
+        Output= aux[Retiming-1];*/
+
+    } // End Global par
+}
+
+
+/*
+%   Input			- Input value for the convolution
+%	Output			- Result of the convolution
+%   KernelX			- Kernel for the X convolution
+%	KernelY			- Kernel for the Y convolution
+%	ColumnLength	- Number of elements of each column
+%
+%	DESCRIPTION
+%				This function computes the separable 2D convolution of the input. 
+%				It buffers 4 columns which, together with the current column, 
+%				make 5 taps. The convolution is then carried out using KernelX
+%				for the rows and KernelY for the columns.
+%
+% RETURN
+%   
+*/
+// SpatialConvolutions_last: separable 5-tap 2D convolution on a pixel stream.
+//   Input        - new sample, read once per cycle
+//   Output       - written by KernelY from the 5-element Y array
+//   KernelX      - macro proc called as KernelX(array5, result) on the X shift register
+//   KernelY      - macro proc called as KernelY(array5, result) on the line-buffer taps
+//   ColumnLength - address at which the line-buffer pointer wraps back to 0
+macro proc SpatialConvolutions_last(Input,Output,KernelX,KernelY, ColumnLength)
+{
+    macro expr PipeLatency=6 + 2;   // not referenced inside this macro
+    macro expr Retiming=1;  // Retiming value = Retiming-1
+ 	//const unsigned int col_size=log2ceil(ColumnLength);
+    
+    // Declare MPRAM and access macros: 4 line buffers, each with one read and
+    // one write port
+	static mpram 
+	{
+    	rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];      //  Read port
+        wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];	    //  Write port
+
+    } ColumnsBuffer[4] with {block = "BlockRAM"}; 
+
+	// readRAM/writeRAM: access port helpers for line buffer 'row' at address 'col'
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+
+    signed (width(Input)) DataArrayX[5], DataArrayY[5] ;
+    //static unsigned (log2ceil((VIDEOINCOLUMNS/SCALE))) col=1, colbis=0;
+    //static unsigned (log2ceil((ColumnLength))) col=1, colbis=0;
+    static unsigned (log2ceil(MAX_RES_X/SCALE)) col=1, colbis=0;   // read / write addresses
+    signed (width(Output)) aux[Retiming];   // only used by the disabled retiming code below
+    
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {
+        // Read data into array every cycle             
+    	DataArrayX[4]=Input;
+            
+    	// Shift X data through array
+    	par (i = 0; i != 4; i++)
+    	{
+    	    DataArrayX[i] = DataArrayX[i+1];
+    	}
+        // First pass: result of the X kernel becomes the newest Y tap
+        KernelX(DataArrayX,DataArrayY[4]);     
+        
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        
+        // Operations by columns: advance the read address with wrap-around; the
+        // write address takes the value col held in this cycle.
+        col= col>=(ColumnLength-1) ? 0 : col+1;
+		colbis= col;
+					
+		// Read data into array every cycle 
+		par(r1=0;r1!=4;r1++)
+		{
+			// Fill data through array: Y tap r1 comes from line buffer r1
+			DataArrayY[r1] = readRAM(adju(r1,3),col);						
+		}	
+
+		// Shift array and write data into block RAMs every cycle:
+		// tap r2+1 is stored into buffer r2, so it is read back as tap r2
+		par(r2=0;r2!=4;r2++)
+		{
+			writeRAM(adju(r2,3),colbis,DataArrayY[r2+1]);						
+		}																							                        
+
+        // Second pass: Y kernel over the 5 taps
+        KernelY(DataArrayY,Output);        
+        
+        // Output retiming (disabled)
+        /*par(i=1;i<(Retiming);i++)
+        {
+            aux[i]=aux[i-1];
+        }
+        Output= aux[Retiming-1];*/
+               
+
+    } // End Global par
+}
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the kernel
+%				k = [2 16 28 16 2]/64. It is a low-pass (smoothing) filter: all taps are
+%				positive and sum to 64, i.e. unit DC gain. Retiming is not used 
+%				in the implementation because the performance was good enough.
+%
+% RETURN
+%   
+*/
+// Prefilter5Taps: 5-tap FIR with mask [2 16 28 16 2]/64.
+//   buffer - array of 5 signed samples; buffer[2] is the center tap
+//   Out    - filtered, rounded result (the low width(Out) bits of the quotient)
+// The weighted sum is built with 6 guard bits, then divided by 64 with
+// rounding to nearest, ties away from zero.
+macro proc Prefilter5Taps(buffer,Out) // mask=[2 16 28 16 2]/64
+{       
+macro expr Retiming=1; // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=6;                   // divide by 2^6 = 64
+macro expr DataWidth=(width(buffer[0])+6);   // input width plus 6 guard bits
+
+signed DataWidth Register[3], aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {
+        // Symmetric taps share one adder: (b0+b4)*2, (b1+b3)*16, b2*28
+        Register[0]=(adjs(buffer[0],DataWidth)+adjs(buffer[4],DataWidth))<<1;
+        Register[1]=(adjs(buffer[1],DataWidth)+adjs(buffer[3],DataWidth))<<4;
+        Register[2]=(adjs(buffer[2],DataWidth))*28;
+        //xilinxmult(Register[2], (adjs(buffer[2],DataWidth)) ,((int 18) 28) );			 					          
+
+        aux0= Register[0] + Register[1] + Register[2];
+
+        // Rounding. ">>" on a signed value is an arithmetic shift, i.e. it rounds
+        // towards minus infinity. Adding half an LSB (32) therefore rounds
+        // non-negative sums to nearest with ties upwards. For negative sums the
+        // mirror image is "add half minus one" (31), which gives ties away from
+        // zero. The previous code subtracted 32 on this branch, which made every
+        // negative result one too low (e.g. -64/64 gave -2) and could wrap for
+        // the most negative sum.
+        if(sign(aux0))
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+
+        // Output retiming (disabled)
+        //par(i=1;i<(Retiming);i++)
+        //{
+        //    aux[i]=aux[i-1];
+        //}
+        Out= aux[Retiming-1];
+        //Out=buffer[0];
+    }
+}
+
+/* 
+%   Num			- Numerator
+%   Den			- Denominator
+%   Result		- Quotient
+%	
+%	DESCRIPTION
+%				This function computes the division of Num by Den, obtaining
+%				the quotient that is returned in result. It could be done using the
+%				standard Handel-C implementation, simply as result = Num/Den.
+%				The problem is that the performance is affected by the required 
+%				logic and resources. This is why we are using a core from 
+%				the Core Generator. The interface is divider_18 because we are using
+%				18 bits for the division to obtain a better precision.
+%
+% RETURN
+%   
+*/
+// division_core: wires Num and Den to the external divider_18 core and copies
+// the core's quotient port into result.
+//   Num    - dividend, connected as-is (expected to be DIVIDER_INPUT bits wide)
+//   Den    - divisor, sign-adjusted to DIVIDER_INPUT bits
+//   result - receives divider.quot
+macro proc division_core(Num, Den, result)
+{
+    // Clock enable of the core: a signal that defaults to 0 and is driven to 1
+    // in the par block below.
+    static signal unsigned 1 coreEnable=0;
+
+    interface divider_18(signed DIVIDER_INPUT quot,
+                         signed DIVIDER_INPUT remd,
+                         unsigned 1 rfd)
+        divider(signed DIVIDER_INPUT dividend = Num,
+                signed DIVIDER_INPUT divisor = adjs(Den,DIVIDER_INPUT),
+                unsigned 1 clk=__clock,
+                unsigned 1 ce=coreEnable)
+        with {busformat="B<I>"};
+
+    par
+    {
+        // Quotient port of the core
+        result = divider.quot;
+
+        // Enable the division core
+        coreEnable=1;
+    }
+}
+
+
+/* 
+%   Input			- Input value for the convolution
+%   Output			- Delayed input value
+%   ColumnLength	- Number of elements of each column
+%	
+%	DESCRIPTION
+%				This function delays the input by as many cycles as the
+%				function SpatialConvolutions_last takes. It is used for synchronization.
+%
+% RETURN
+%   
+*/
+// Delaying: passes Input through a 7-stage shift register and 6 block-RAM line
+// buffers, and outputs tap 3 of the line-buffer array.
+//   Input        - sample to delay, read once per cycle
+//   Output       - delayed sample (DataArrayY[3])
+//   ColumnLength - address at which the line-buffer pointer wraps back to 0
+macro proc Delaying(Input, Output, ColumnLength)
+{
+    // Line buffers: one read port and one write port each
+    static mpram
+    {
+        rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];
+        wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];
+    } ColumnsBuffer[6] with {block = "BlockRAM"};
+
+    // Port access helpers for line buffer 'bank' at address 'addr'
+    macro expr readRAM (bank,addr) = (ColumnsBuffer[bank]).Read[addr];
+    macro proc writeRAM (bank,addr,value)
+    {
+        (ColumnsBuffer[bank]).Write[addr]=value;
+    }
+
+    signed (width(Input)) DataArrayX[7];
+    signed (width(Input)) DataArrayY[7];
+    static unsigned (log2ceil(MAX_RES_X/SCALE)) readAddr=1;
+    static unsigned (log2ceil(MAX_RES_X/SCALE)) writeAddr=0;
+
+    par
+    {
+        // Delayed output
+        Output=DataArrayY[3];
+
+        // Store tap k+1 into buffer k, so that it is read back as tap k
+        par(wr=0;wr!=6;wr++)
+        {
+            writeRAM(adju(wr,3),writeAddr,DataArrayY[wr+1]);
+        }
+
+        // Load taps 0..5 from the line buffers
+        par(rd=0;rd!=6;rd++)
+        {
+            DataArrayY[rd] = readRAM(adju(rd,3),readAddr);
+        }
+
+        // Address update: the read address wraps at ColumnLength-1 and the
+        // write address follows with the value the read address held this cycle
+        writeAddr= readAddr;
+        readAddr= readAddr>=(ColumnLength-1) ? 0 : readAddr+1;
+
+        // Newest line-buffer tap comes from stage 4 of the X shift register
+        DataArrayY[6] = DataArrayX[4];
+
+        // X shift register: the new sample enters at the top and moves down
+        par (stage = 0; stage != 6; stage++)
+        {
+            DataArrayX[stage] = DataArrayX[stage+1];
+        }
+        DataArrayX[6]=Input;
+    } // End Global par
+}
\ No newline at end of file
--- a/attention/attention_v0.1/GaborPrimitives.hch
+++ b/attention/attention_v0.1/GaborPrimitives.hch
+//********************************************************************
+// 
+//  Programmed by Javier Díaz, DRIVSCO project
+//  Granada, March 2008, version 2.1
+// 
+//********************************************************************
+#ifndef __GABORPRIMITIVES__
+#define __GABORPRIMITIVES__
+
+#include <stdlib.hch>
+#include "generic.hch"
+#include "parameters.hch"
+
+
+// Data bit-widths 
+//*****************************************
+// All values are bit counts unless stated otherwise. The trailing "//NN" values
+// are alternative settings left by the authors.
+#define KERN_BITS	 12  //14
+
+#define CONV_BITS 	 10
+#define CONV_FRACT_BITS 0
+
+#define F_BITS 	  	(CONV_BITS)  	// USE CONV_BITS+1 FOR OPTICAL FLOW AND STEREO OR 17 FOR LOCAL FEATURES
+#define TRIG_BITS	 9              // width of the sin/cos constant tables
+
+#define ORIENTED_ENERGY_BITS 20
+#define ACORI_BITS	(24)	
+
+#define PHASE_ORI_BITS 	 10     // 16 for hardware, 64 for debugging
+#define ENER_BITS   10  	    // 16 for hardware, 32 for debugging
+
+#define ACPHI_BITS	10//(COSLUTCOREWIDTH+F_BITS+3)	 // in fact this is larger than the software simulator
+//#define PI 25736  // it uses 16 bits in format 2QN (1 bit sign, 2 bit integer part, 13 bits fractional part)
+#define PI 201  // it uses 10 bits ( 6 bits of frac part)
+
+// SQRT CORE
+// NOTE(review): the original remark here read "SCALED RADIANS 2Q24 FORMAT (1 for
+// sign, 2 as integer part and the others as fractional part)", which describes an
+// angle format and presumably belongs to the arc tan core below -- confirm.
+#define SQRTCOREWIDTHIN 20
+#define SQRTCOREWIDTHOUT 10
+#define SQRTCORELATENCY  (SQRTCOREWIDTHOUT+2)   // evaluates to 12
+#define SQRTCORENAME sqrtcordic
+
+// ARC TAN CORE
+#define ATAN2COREWIDTHIN 24
+#define ATAN2COREWIDTHOUT 24
+#define ATAN2CORELATENCY (ATAN2COREWIDTHOUT+4)   // evaluates to 28
+#define ATAN2CORENAME atan2cordic24
+
+// ARC TAN CORE PHASE
+#define ATAN2COREWIDTHIN_P 10   //32
+#define ATAN2COREWIDTHOUT_P 10   //32
+#define ATAN2CORELATENCY_P (ATAN2COREWIDTHOUT_P+4)   // evaluates to 14
+#define ATAN2CORENAME_P atan2cordic10//atan2cordic24
+
+// cosLUT CORE
+#define COSLUTCOREWIDTH 10
+#define COSLUTCORENAME cosLUT
+
+/* 		BIT CONFIGUATION EXAMPLES
+*****************************************
+
+1)	#define KERN_BITS	 11
+	#define CONV_BITS 	  9
+
+	#define ATAN2COREWIDTH 	 20
+	#define ATAN2CORENAME atan2cordic20
+
+2)	#define KERN_BITS	 13
+	#define CONV_BITS 	 11
+
+	#define ATAN2COREWIDTH 	 24
+	#define ATAN2CORENAME atan2cordic24
+
+3)	#define KERN_BITS	 15 / 17 / 17 / 19 / 21
+	#define CONV_BITS 	 14 / 16 / 18 / 20 / 22
+
+	#define ATAN2COREWIDTH 	 30 / 34 / 38 / 42 / 46
+	#define ATAN2CORENAME atan2cordic30 / atan2cordic34 / atan2cordic38 / atan2cordic42 / atan2cordic46
+*/
+
+
+// Extra parameters
+#define NORIENTATIONS 8            // number of filter orientations
+//#define PI 201 // 3.14 (3 bit integer, 6 bit fractional)
+//#define NAN 0b100000000000
+//#define NSCALES 1
+#define MAX_PROC_DISPARITY 2
+#define MAX_PROC_FLOW 3
+#define FLOW_INDEX_BITS 2
+#define DISPARITY_INDEX_BITS 1
+#define DIVIDER_INPUT 18           // operand width of the divider core, in bits
+// Parenthesized so that the macro expands as a single value inside larger
+// expressions (e.g. N-DIVIDER_LATENCY or 2*DIVIDER_LATENCY), like the other
+// *LATENCY macros in this header.
+#define DIVIDER_LATENCY (DIVIDER_INPUT+4) // is +4 if divider has clks/div==1
+
+
+// Generic Macros
+// SumMacro(Array, begin, Index, Extend): sum of Array[begin..Index], each element
+// widened to Extend bits.
+macro expr SumMacro(Array, begin, Index,Extend);
+
+//            Computing Macros
+/*********************************************************************/
+// Prototypes of the macro procedures implemented in the matching source file.
+// NOTE(review): Sort and Median have no prototype in this list; presumably
+// they are only used inside the source file that defines them -- confirm.
+macro proc GenericConvolution(Input, Output, X_FIR, Y_FIR, NTaps, NTapsMinus1, ColumnLength,normx, normy,Sx,Sy);
+macro proc GaborY(Input, FNY, NTaps, NTapsMinus1, ColumnLength);
+macro proc GaborBase(DataIn, FNYNX,Columns);
+macro proc BuildGabor(FNYNX,fe,fo);
+macro proc SortNaN(bufferIn,bufferOut, bufferLength, offset);
+macro proc PhasePrimitive (fe, fo, phase, latency);
+macro proc Primitives(fe,fo,Energy, Orientation, TH ,Latencies);
+macro proc Primitives_short(fe,fo,Energy, Orientation, TH ,Latencies);
+
+//          Added macros (F Barranco)
+/*********************************************************************/
+macro proc SpatialConvolutions_last(Input,Output,KernelX,KernelY, ColumnLength);
+macro proc Delaying(Input, Output, ColumnLength);
+macro proc Prefilter5Taps(buffer,Out);
+macro proc division_core(Num, Den, result);
+
+#endif
\ No newline at end of file
--- a/attention/attention_v0.1/README
+++ b/attention/attention_v0.1/README
--- a/attention/attention_v0.1/channels.hcc
+++ b/attention/attention_v0.1/channels.hcc
+/* channels.hcc
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+#include "channels.hch"
+
+// ***************************************************************
+// 		Channels implemented using signals
+// ***************************************************************
+
+
+/* 
+%   Channel		- Send data through this channel
+%	Input		- Data to be sent through the channel
+%
+%	DESCRIPTION
+%				This function sends Input through Channel.
+%				Channels are structs declared
+%				in channels.hcc file
+%
+% RETURN
+%   
+*/
+macro proc Send(Channel, Input)
+{
+    // Latched copy of the receiver's ready flag; it terminates the loop below.
+    unsigned 1 receiverSeen;
+
+    // The loop body always executes for at least one clock cycle.
+    do
+    {
+        par
+        {
+            // Announce that a sender is present on the channel.
+            Channel.SendReady = 1;
+            // Drive the payload onto the channel's data wires.
+            Channel.DataTransfer = Input;
+            // Sample the receiver's flag for this cycle.
+            receiverSeen = Channel.ReadReady;
+        }
+    }while(!receiverSeen); // keep driving until a receiver was ready
+}
+
+/* 
+%   Channel		- Send signed data through this channel 
+%	Input		- Data to be sent through the channel
+%
+%	DESCRIPTION
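+%				This function sends Input through Channel; a receiver must be reading
+%				on the other side for the call to complete. If the receiver is already
+%				ready, Input is sent directly. Otherwise Input is first copied into a
+%				local signed variable and that copy is sent, waiting until the receiver
+%				is ready. While nothing drives the channel, its signals keep their
+%				default value (0 for channels declared with SIGNED_CHANNEL).
+%				Channels are structs declared in the channels.hch file.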
+%
+% RETURN
+%   
+*/
+macro proc SignedSecureSend(Channel, Input)
+{
+	// Holding variable for Input on the "receiver not ready yet" path;
+	// its width is left for the compiler to infer.
+	signed auxInput;
+
+	// Receiver already waiting: pass Input straight to the blocking Send.
+	if (Read_Ready(Channel))
+		Send(Channel, Input);
+	else
+	{
+		// Receiver not ready: copy Input into auxInput first, then Send the
+		// copy. Send itself loops until it samples ReadReady high.
+		auxInput=Input;
+		Send(Channel, auxInput);   
+	}   
+}
+
+/* 
+%   Channel		- Send unsigned data through this channel 
+%	Input		- Data to be sent through the channel
+%
+%	DESCRIPTION
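+%				This function sends Input through Channel; a receiver must be reading
+%				on the other side for the call to complete. If the receiver is already
+%				ready, Input is sent directly. Otherwise Input is first copied into a
+%				local unsigned variable and that copy is sent, waiting until the receiver
+%				is ready. While nothing drives the channel, its signals keep their
+%				default value (0 for channels declared with UNSIGNED_CHANNEL).
+%				Channels are structs declared in the channels.hch file.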
+%
+% RETURN
+%   
+*/
+macro proc UnsignedSecureSend(Channel, Input)
+{
+	// Holding variable for Input on the "receiver not ready yet" path;
+	// its width is left for the compiler to infer.
+	unsigned auxInput;
+
+	// Receiver already waiting: pass Input straight to the blocking Send.
+	if (Read_Ready(Channel))
+		Send(Channel, Input);
+	else
+	{
+		// Receiver not ready: copy Input into auxInput first, then Send the
+		// copy. Send itself loops until it samples ReadReady high.
+		auxInput=Input;
+		Send(Channel, auxInput);   
+	}   
+}
+
+/* 
+%   Channel		- Receive data coming through this channel 
+%	Output		- Data to be received through the channel
+%
+%	DESCRIPTION
+%				This function safely receives Output through Channel. 
+%				It waits until the ready signal is activated (meaning that 
+%				the sending part is ready) and then receives the data. 
+%				This function is blocked until the reception of the first 
+%				transference.
+%				Channels are structs declared in channels.hcc file
+%
+% RETURN
+%   
+*/
+macro proc Receive(Channel, Output)
+{
+    // Latched copy of the sender's ready flag; it terminates the loop below.
+    unsigned 1 senderSeen;
+
+    // The loop body always executes for at least one clock cycle.
+    do
+    {
+        par
+        {
+            // Announce that a receiver is listening on the channel.
+            Channel.ReadReady = 1;
+            // Remember whether a sender was present during this cycle.
+            senderSeen = Channel.SendReady;
+
+            // Capture the payload only while the sender flags it as ready;
+            // otherwise just let the clock cycle pass.
+            if (Channel.SendReady)
+            {
+                Output = Channel.DataTransfer;
+            }
+            else
+                delay;
+        }
+    }while(!senderSeen); // keep listening until a sender was ready
+}
+
+
+/* 
+%   Channel		- Channel  
+%
+%	DESCRIPTION
+%				This function checks whether the sender is ready or not.
+%				Channels are structs declared in channels.hcc file
+%
+% RETURN
+%
+%	SendReady 	- Signal that is active if the sender is ready to transmit data 
+%   
+*/
+// Non-blocking probe: evaluates to the channel's SendReady signal.
+macro expr Send_Ready(Channel) = Channel.SendReady;
+
+
+/* 
+%   Channel		- Channel  
+%
+%	DESCRIPTION
+%				This function checks whether the receiver is ready or not.
+%				Channels are structs declared in channels.hcc file
+%
+% RETURN
+%
+%	ReadReady 	- Signal that is active if the receiver is ready to receive data 
+%   
+*/
+// Non-blocking probe: evaluates to the channel's ReadReady signal.
+macro expr Read_Ready(Channel) = Channel.ReadReady;
\ No newline at end of file
--- a/attention/attention_v0.1/channels.hch
+++ b/attention/attention_v0.1/channels.hch
+/* channels.hch
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+
+#ifndef __CHANNELS__
+#define __CHANNELS__
+
+#include "stdlib.hch"
+
+// 		Channels implemented using signals
+// ***************************************************************
+// Handshake channel carrying an unsigned payload, built from three signals.
+struct unsignedchannel
+{
+    signal unsigned 1 ReadReady;    // set by the receiving side (see Receive)
+    signal unsigned 1 SendReady;    // set by the sending side (see Send)
+    signal unsigned DataTransfer;   // payload wires; width is not fixed here
+};
+
+// Handshake channel carrying a signed payload, built from three signals.
+struct signedchannel
+{
+    signal unsigned 1 ReadReady;    // set by the receiving side (see Receive)
+    signal unsigned 1 SendReady;    // set by the sending side (see Send)
+    signal signed DataTransfer;     // payload wires; width is not fixed here
+};
+
+// Definition of a channel with default values of 0
+// (both shorthands expand to a static instance of the matching struct)
+#define UNSIGNED_CHANNEL static struct unsignedchannel
+#define SIGNED_CHANNEL static struct signedchannel
+//	Example channel declaration: declare a variable MyChannel 
+//	as channel structure with default value of zero
+//	UNSIGNED_CHANNEL MyChannel;
+
+// Blocking transfer macros; their bodies are in channels.hcc.
+macro proc Send(Channel, Input);
+macro proc SignedSecureSend(Channel, Input);
+macro proc UnsignedSecureSend(Channel, Input);
+macro proc Receive(Channel, Output);
+
+// These expressions allow the user to implement non-blocking channels: 
+// This channel structure has the readiness of the send and 
+// receive process exposed as signals, allowing the user to check
+// the status of a channel. This can be simply expressed as 
+// expressions in Handel-C thus:
+
+//Check whether the sender is ready
+macro expr Send_Ready(Channel);
+
+//Check whether the receiver is ready
+macro expr Read_Ready(Channel);
+
+#endif
+
--- a/attention/attention_v0.1/cores.hcc
+++ b/attention/attention_v0.1/cores.hcc
+/* cores.hcc
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#include "cores.hch"
+
+// 		Interfaces for the top and core projects
+// ***************************************************************
+
+/* 
+%   Input		- Input channel for the 3 pixels of 8 bits (from the 3 RGB channels)
+%   Output		- Local descriptor feature maps (27 bits)
+%   Control		- Control word with the different parameters:
+%             		* Control[60:45]	- Latencies for the feature estimation (gabor modules)
+%             		* Control[44:36]	- Thresholds for the feature estimation 
+%             		* Control[35:26]	- Number of columns of the input images
+%             		* Control[24:21]	- Not used
+%             		* Control[20:13]	- Not used 
+%             		* Control[12:0]		- Latency cycles of the pipeline 
+%	ImSize		- Size of the input images
+%
+%	DESCRIPTION
+%				Interface for a top architecture to interface with the attention estimation core
+% RETURN
+%   
+*/
+macro proc InterfazTopFlowCore_lf_attention(Input, Output, Control, ImSize)
+{
+// Instantiates the external core "CoreOpticFlow" as MyCore and wires it to
+// the Input/Output channel structs, Control and ImSize.
+macro expr InWidth=24;  //192;//24;
+//macro expr OutWidth=51;//24;
+macro expr OutWidth=27;//18; 9 bits are useless
+// First parenthesis: ports read from the core (CoreOut, OutSendReady,
+// InReadReady). Second parenthesis: values driven into the core (clock,
+// image size, input data and handshake flags, command word).
+interface CoreOpticFlow( signal OutWidth CoreOut, signal unsigned 1 OutSendReady, signal unsigned 1 InReadReady) 
+	MyCore( unsigned 1 clk=__clock, unsigned imSize=ImSize, signal InWidth CoreIn=Input.DataTransfer, 
+	signal unsigned 1 InSendReady=Input.SendReady,
+    signal unsigned 1 OutReadReady=Output.ReadReady, unsigned cmd=Control)with{retime=0};
+
+	// Every clock cycle, forever: copy the core's outputs onto the channels.
+	while(1)
+	{
+		par
+		{
+			Output.DataTransfer=MyCore.CoreOut;
+			Output.SendReady=MyCore.OutSendReady;
+			Input.ReadReady=MyCore.InReadReady;
+		}
+	}
+
+}
+
+/* 
+%   Input		- Input channel for the 3 pixels of 8 bits (from the 3 RGB channels)
+%   Output		- Local descriptor feature maps (27 bits)
+%   Control		- Control word with the different parameters:
+%             		* Control[60:45]	- Latencies for the feature estimation (gabor modules)
+%             		* Control[44:36]	- Thresholds for the feature estimation 
+%             		* Control[35:26]	- Number of columns of the input images
+%             		* Control[24:21]	- Not used
+%             		* Control[20:13]	- Not used 
+%             		* Control[12:0]		- Latency cycles of the pipeline 
+%	ImSize		- Size of the input images
+%
+%	DESCRIPTION
+%				Interface for the attention estimation core (used in the main.hcc)
+%
+% RETURN
+%   
+*/
+macro proc InterfazCore_lf_attention(Input, Output, Control,ImSize)
+{
+// Declares the core's external ports and connects them to the Input/Output
+// channel structs, Control and ImSize. CORE==1 uses port_in/port_out
+// interfaces; any other value uses bus_in/bus_out with the same port names.
+#if CORE==1
+	// Outgoing data
+	interface port_out() OutData(signal CoreOut = Output.DataTransfer)with{retime=0};	
+	interface port_out() OutSendStatus(signal unsigned 1 OutSendReady = Output.SendReady)with{retime=0};	
+	interface port_in(signal unsigned 1 OutReadReady) OutReadStatus()with{retime=0} ;
+
+
+	// Incoming data
+    //	interface port_in(unsigned 1 clk with {clockport = 1}) ClockPort() ;
+	interface port_in(unsigned  imSize) CimSize()with{retime=0};
+    interface port_in(signal  CoreIn) InData()with{retime=0};	
+	interface port_in(signal unsigned 1 InSendReady) InSendStatus()with{retime=0};
+	interface port_out() InReadStatus(signal unsigned 1 InReadReady = Input.ReadReady)with{retime=0};	
+
+    // Control & Commands
+    interface port_in(unsigned  cmd) Control_Commands()with{retime=0};
+#else
+	// Outgoing data
+	interface bus_out() OutData(signal CoreOut = Output.DataTransfer)with{retime=0};	
+	interface bus_out() OutSendStatus(signal unsigned 1 OutSendReady = Output.SendReady)with{retime=0};	
+	interface bus_in(signal unsigned 1 OutReadReady) OutReadStatus() with{retime=0};
+
+
+	// Incoming data
+    //	interface port_in(unsigned 1 clk with {clockport = 1}) ClockPort() ;
+	interface bus_in(unsigned  imSize) CimSize()with{retime=0};
+	interface bus_in(signal  CoreIn) InData()with{retime=0};	
+	interface bus_in(signal unsigned 1 InSendReady) InSendStatus()with{retime=0};
+	interface bus_out() InReadStatus(signal unsigned 1 InReadReady = Input.ReadReady)with{retime=0};	
+
+    // Control & Commands
+    interface bus_in(unsigned  cmd) Control_Commands()with{retime=0};
+#endif
+	// Every clock cycle, forever: copy the externally driven ports into the
+	// channel structs and into Control / ImSize.
+	while(1)
+	{
+		par
+		{
+			Output.ReadReady=OutReadStatus.OutReadReady;
+			Input.DataTransfer=InData.CoreIn;
+			Input.SendReady=InSendStatus.InSendReady;
+            Control=Control_Commands.cmd;
+			ImSize=CimSize.imSize;            
+		}
+	}
+
+}
--- a/attention/attention_v0.1/cores.hch
+++ b/attention/attention_v0.1/cores.hch
+/* cores.hch
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#ifndef __CORES__
+#define __CORES__
+
+#include "stdlib.hch"
+#include "channels.hch"
+//#include "xircav4_lib.hch" //Platform-dependent
+
+// Selects the interface flavour in InterfazCore_lf_attention (cores.hcc):
+// 1 -> port_in/port_out interfaces, anything else -> bus_in/bus_out.
+#define CORE 1 // 0 for sub-circuit test, 1 for core calls
+
+//Attention cores
+// Forward declarations; the bodies are in cores.hcc.
+macro proc InterfazCore_lf_attention(Input, Output, Control,ImSize);
+macro proc InterfazTopFlowCore_lf_attention(Input, Output, Control, ImSize);
+
+
+#endif
\ No newline at end of file
--- a/attention/attention_v0.1/generic.hcc
+++ b/attention/attention_v0.1/generic.hcc
+/* generic.hcc
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+#include "generic.hch"
+
+
+// Pipeline synchronization delays
+/* 
+%   DelayCycles		- Number of cycles of the delay
+%
+%	DESCRIPTION
+%				This function sequentially generates the number of cycles that 
+%				is passed in DelayCycles. It can be used for synchronization.
+%
+% RETURN
+%   
+*/
+macro proc PipelineDelay(DelayCycles)
+{    
+    // Replicated sequential block: one `delay` statement (one idle clock
+    // cycle) per iteration, DelayCycles iterations in a row.
+    seq(cycle=0; cycle<(DelayCycles); cycle++)
+    {
+        delay;
+    }
+}
+
+/* 
+%   input		- Input data
+%
+%	DESCRIPTION
+%				This function creates a NaN value. The value depends on
+%				the width of the input: it is a 1 followed by as many zeros 
+%				as the width of input minus 1.
+%
+% RETURN
+%				The NaN value for the width of input.
+%   
+*/
+// Only the top bit set: 1 shifted left by (width(input)-1) positions.
+macro expr SetNAN(input) = 1<<(width(input)-1);
--- a/attention/attention_v0.1/generic.hch
+++ b/attention/attention_v0.1/generic.hch
+/* generic.hch
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#ifndef __GENERIC_HCH__
+#define __GENERIC_HCH__ 
+
+#include "stdlib.hch"
+#include "parameters.hch"
+#include "cores.hch"
+#include "channels.hch"
+#include "bilinear_warping_v2.hch"
+
+// Bundle of signals describing a FIFO with 12-bit data words.
+// NOTE(review): signal directions below are inferred from the names only -
+// confirm against SecureFifoChannel_12, which is not defined in this header.
+static struct SECURE_FIFO_CHANNEL_INTERFACE_12
+{
+    signal unsigned 1 wren;        // presumably write enable
+    signal unsigned 1 rden;        // presumably read enable
+    signal unsigned 12 data_w;     // 12-bit word, presumably the write side
+    signal unsigned 12 data_r;     // 12-bit word, presumably the read side
+    signal unsigned 1 full;        // presumably FIFO-full flag
+    signal unsigned 1 empty;       // presumably FIFO-empty flag
+};
+
+// Shorthand for declaring a static instance of the struct above.
+#define SECURE_FIFO_CHANNEL_12 static struct SECURE_FIFO_CHANNEL_INTERFACE_12
+
+    
+// FIFO helper macros (their bodies are not in generic.hcc).
+macro proc SecureFifoChannel_12(PtrInterface);
+macro proc MyFIFORead_12(PtrInterface, data);
+macro proc MyFIFOWrite_12(PtrInterface, data);
+// Defined in generic.hcc. PipelineDelay was missing from this header, so
+// other source files including generic.hch had no declaration for it.
+macro proc PipelineDelay(DelayCycles);
+macro expr SetNAN(input);
+
+#endif
\ No newline at end of file
--- a/attention/attention_v0.1/lklib.hcc
+++ b/attention/attention_v0.1/lklib.hcc
+/* lklib.hcc
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#include "lklib.hch"
+#include "cores.hch"
+#include "parameters.hch"
+
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the kernel
+%				k = [2 16 28 16 2]/64. All taps are positive, so it is a smoothing
+%				(low-pass) filter. The Retiming is not used 
+%				for the implementation because the performance was good enough.
+%
+% RETURN
+%   
+*/
+macro proc Prefilter5Taps(buffer,Out) // mask=[2 16 28 16 2]/64
+{       
+// 5-tap symmetric smoothing filter:
+//   Out = round((2*b[0] + 16*b[1] + 28*b[2] + 16*b[3] + 2*b[4]) / 64)
+//   buffer - array holding the 5 samples, buffer[2] being the centre tap
+//   Out    - receives the rounded result, truncated to width(Out)
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1; // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=6; 
+macro expr DataWidth=(width(buffer[0])+6);
+
+signed DataWidth Register[3], aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {
+        // Symmetric taps are added first, then weighted (x2, x16, x28).
+        Register[0]=(adjs(buffer[0],DataWidth)+adjs(buffer[4],DataWidth))<<1;
+        Register[1]=(adjs(buffer[1],DataWidth)+adjs(buffer[3],DataWidth))<<4;
+        Register[2]=(adjs(buffer[2],DataWidth))*28;        
+
+        aux0= Register[0] + Register[1] + Register[2];
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative sum needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -64, i.e. -1.0, gave -2).
+        if(sign(aux0))
+            aux[0]= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+
+        //par(i=1;i<(Retiming);i++)
+        //{
+        //    aux[i]=aux[i-1];
+        //}
+        Out= aux[Retiming-1];
+    }
+}
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the kernel
+%				k = [14 35 14]/64. All taps are positive, so it is a smoothing
+%				(low-pass) filter. The Retiming is not used 
+%				for the implementation because the performance was good enough.
+%
+% RETURN
+%   
+*/
+macro proc Prefilter3Taps(buffer,Out) // mask=[14 35 14]/64
+{       
+// 3-tap symmetric smoothing filter:
+//   Out = round((14*b[0] + 35*b[1] + 14*b[2]) / 64)
+//   buffer - array holding the 3 samples, buffer[1] being the centre tap
+//   Out    - receives the rounded result, truncated to width(Out)
+// Note that the weights add up to 63, not 64.
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1; // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=6; 
+macro expr DataWidth=(width(buffer[0])+7);
+
+signed DataWidth Register[2], aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {
+        // Outer taps are added first and share the weight 14.
+        Register[0]=(adjs(buffer[0],DataWidth)+adjs(buffer[2],DataWidth))*14;
+        Register[1]=(adjs(buffer[1],DataWidth))*35;
+        
+        aux0= Register[0] + Register[1];
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative sum needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -64, i.e. -1.0, gave -2).
+        if(sign(aux0))
+            aux[0]= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+
+        Out= aux[0];
+    }
+}
+
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the derivative 
+%				kernel k = [7 18 0 -18 -7]/64. The Retiming is not used 
+%				for the implementation because the performance was good enough.
+%
+% RETURN
+%   
+*/
+macro proc Diff5Taps(buffer,Out) // mask=[7 18 0 -18 -7]/64
+{       
+// 5-tap antisymmetric derivative filter:
+//   Out = round((7*(b[0]-b[4]) + 18*(b[1]-b[3])) / 32)
+//   buffer - array holding the 5 samples (the centre tap has weight 0)
+//   Out    - receives the rounded result, truncated to width(Out)
+// The shift is 5 rather than 6, so the result keeps one fractional bit of
+// the /64 kernel.
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1;  // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=5; // 2^6=64 but we take 1 decimal bit--> 5.
+macro expr DataWidth=(width(buffer[0])+6);
+
+signed DataWidth Register[2], aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {   
+        // Opposite taps are subtracted first, then weighted (x7, x18).
+        Register[0]=(adjs(buffer[0],DataWidth)-adjs(buffer[4],DataWidth))*7;
+        Register[1]=(adjs(buffer[1],DataWidth)-adjs(buffer[3],DataWidth))*18;
+        //xilinxmult(Register[0], (adjs(buffer[0],DataWidth)-adjs(buffer[4],DataWidth)) ,((int 18) 7) );			 					          
+        //xilinxmult(Register[1], (adjs(buffer[1],DataWidth)-adjs(buffer[3],DataWidth)) ,((int 18) 18) );			 					          
+
+        aux0= Register[0] + Register[1];
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative sum needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -32, i.e. -1.0, gave -2),
+        // which biased negative derivatives against positive ones.
+        if(sign(aux0))
+            aux[0]= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+        //par(i=1;i<(Retiming);i++)
+        //{
+        //    aux[i]=aux[i-1];
+        //}
+        Out= aux[Retiming-1];
+    }
+
+}
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the derivative 
+%				kernel k = [29 0 -29]/64. The Retiming is not used for the implementation 
+%				because the performance was good enough.
+%
+% RETURN
+%   
+*/
+macro proc Diff3Taps(buffer,Out) // mask=[29 0 -29]/64
+{       
+// 3-tap antisymmetric derivative filter:
+//   Out = round(29*(b[0]-b[2]) / 32)
+//   buffer - array holding the 3 samples (the centre tap has weight 0)
+//   Out    - receives the rounded result, truncated to width(Out)
+// The shift is 5 rather than 6, so the result keeps one fractional bit of
+// the /64 kernel.
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1;  // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=5; // 2^6=64 but we take 1 decimal bit--> 5.
+macro expr DataWidth=(width(buffer[0])+6);
+
+signed DataWidth Register, aux0;
+//signed (width(Out)) aux[Retiming];
+signed (width(Out)) aux;
+    
+
+    par
+    {   
+        Register=(adjs(buffer[0],DataWidth)-adjs(buffer[2],DataWidth))*29;
+        
+        aux0 = Register;
+        
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative value needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -32, i.e. -1.0, gave -2),
+        // which biased negative derivatives against positive ones.
+        if(sign(aux0))
+            aux= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+        
+        Out = aux;
+    }
+
+}
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the weighting 
+%				kernel k = [1 4 6 4 1]/16. This is a Gaussian-like low-pass filter.
+%				The Retiming is not used for the implementation because the performance 
+%				was good enough.
+%
+% RETURN
+%   
+*/
+// ***************************************************************************
+macro proc Weighting5(buffer,Out) // mask=[1 4 6 4 1]/16
+{       
+// 5-tap symmetric weighting filter:
+//   Out = round((b[0] + 4*b[1] + 6*b[2] + 4*b[3] + b[4]) / 8)
+//   buffer - array holding the 5 samples, buffer[2] being the centre tap
+//   Out    - receives the rounded result, truncated to width(Out)
+// The shift is 3 rather than 4, so the result is twice the /16 kernel value.
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1;  // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=3; // 2^4=16 but the whole derivative range is not used.
+                           // --> one bits more is available
+macro expr DataWidth=(width(buffer[0])+5); //--> 5 is more accurate!!!
+
+signed DataWidth Register[3],aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {
+        // Symmetric taps are added first, then weighted (x1, x4, x6).
+        Register[0]=(adjs(buffer[0],DataWidth)+adjs(buffer[4],DataWidth));
+        Register[1]=(adjs(buffer[1],DataWidth)+adjs(buffer[3],DataWidth))<<2;
+        Register[2]=(adjs(buffer[2],DataWidth))*6;
+
+        aux0= Register[0] + Register[1] + Register[2];
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative sum needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -8, i.e. -1.0, gave -2).
+        if(sign(aux0))
+            aux[0]= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+
+        //par(i=1;i<(Retiming);i++)
+        //{
+        //    aux[i]=aux[i-1];
+        //}
+        //Out= aux[Retiming-1];
+        Out= aux[0];
+
+    }
+}
+
+/* 
+%   buffer		- Buffer with the current pixel (center) and the neighborhood
+%	Out			- Output value for the center element
+%
+%	DESCRIPTION
+%				This function computes the filtered pixel (center) using the weighting 
+%				kernel k = [1 2 1]/4. This is a Gaussian-like low-pass filter.
+%				The Retiming is not used for the implementation because the performance 
+%				was good enough.
+%
+% RETURN
+%   
+*/
+macro proc Weighting3(buffer,Out) // mask=[1 2 1]/4
+{       
+// 3-tap symmetric weighting filter:
+//   Out = round((b[0] + 2*b[1] + b[2]) / 2)
+//   buffer - array holding the 3 samples, buffer[1] being the centre tap
+//   Out    - receives the rounded result, truncated to width(Out)
+// The shift is 1 rather than 2, so the result is twice the /4 kernel value.
+// All statements share one par block, so a call takes one clock cycle and
+// Register -> aux0 -> aux act as consecutive pipeline registers when the
+// macro is invoked on every cycle.
+macro expr Retiming=1;  // Retiming value = Retiming-1
+macro expr PipeLatency=3+Retiming-1;
+macro expr DivisorShift=1; // 2^2=4 but the whole derivative range is not used.
+                           // --> one bits more is available
+macro expr DataWidth=(width(buffer[0])+2);
+
+signed DataWidth Register[2],aux0;
+signed (width(Out)) aux[Retiming];
+
+    par
+    {
+        // Outer taps are added first; the centre tap is doubled.
+        Register[0]=(adjs(buffer[0],DataWidth)+adjs(buffer[2],DataWidth));
+        Register[1]=(adjs(buffer[1],DataWidth))<<1;
+
+        aux0= Register[0] + Register[1];
+        // Rounding to nearest, halves away from zero.
+        // ">>" on a signed value floors (rounds towards minus infinity), so a
+        // negative sum needs +half-1. The previous "-half" pushed every
+        // negative result one step too low (an exact -2, i.e. -1.0, gave -2).
+        if(sign(aux0))
+            aux[0]= (((aux0+((signed)0@exp2(DivisorShift-1)))-1)>> DivisorShift)<-(width(Out));
+        else
+            aux[0]= ((aux0+((signed)0@exp2(DivisorShift-1)))>> DivisorShift)<-(width(Out));
+
+        // Optional extra retiming registers (none while Retiming is 1).
+        par(i=1;i<(Retiming);i++)
+        {
+            aux[i]=aux[i-1];
+        }
+        Out= aux[Retiming-1];
+
+    }
+}
+
+/*
+%   Input			- Input value for the convolution
+%	Output			- Result of the convolution
+%   KernelX			- Kernel for the X convolution
+%	KernelY			- Kernel for the Y convolution
+%	ColumnLength	- Number of elements of each column
+%
+%	DESCRIPTION
+%				This function computes the separable 2D convolution of the input. 
+%				It stores 4 columns before performing it, with the current column 
+%				they are 5. Then, the convolution is carried out using KernelX for
+%				for the rows and KernelY for the columns.
+%
+% RETURN
+%   
+*/
+macro proc SpatialConvolutions_optf(Input,Output,KernelX,KernelY, ColumnLength)
+{
+    // Separable 2D filter: KernelX runs over a 5-sample shift register fed by
+    // Input, and KernelY runs over 4 values read back from block RAM plus the
+    // current KernelX result.
+    // NOTE(review): PipeLatency, Retiming and aux are declared but not used
+    // anywhere in this macro.
+    macro expr PipeLatency=6 + 2;
+    macro expr Retiming=1;  // Retiming value = Retiming-1
+ 	
+    // Declare MPRAM and access macros	
+    // Four dual-port RAMs, each MAX_RES_X/SCALE words of width(Input) bits,
+    // with one read-only and one write-only port.
+	static mpram 
+	{
+    	rom <signed (width(Input))> Read[(MAX_RES_X/SCALE)];      //  Read port
+        wom <signed (width(Input))> Write[(MAX_RES_X/SCALE)];	    //  Write port
+
+    } ColumnsBuffer[4] with {block = "BlockRAM"}; 
+
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+
+    signed (width(Input)) DataArrayX[5], DataArrayY[5] ;
+    // col is the RAM read address; colbis follows it one cycle later and is
+    // used as the write address.
+    static unsigned (log2ceil(MAX_RES_X/SCALE)) col=1, colbis=0;
+    signed (width(Output)) aux[Retiming];
+    
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {
+        // Read data into array every cycle             
+    	DataArrayX[4]=Input;
+            
+    	// Shift X data through array
+    	par (i = 0; i != 4; i++)
+    	{
+    	    DataArrayX[i] = DataArrayX[i+1];
+    	}
+        // First pass: its result becomes the newest entry of the Y window.
+        KernelX(DataArrayX,DataArrayY[4]);     
+        
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        
+        // Operations by columns                
+        // Address counter wraps to 0 after reaching ColumnLength-1.
+        col= col>=(ColumnLength-1) ? 0 : col+1;
+		colbis= col;
+					
+		// Read data into array every cycle 
+		par(r1=0;r1!=4;r1++)
+		{
+			// Fill data through array
+			DataArrayY[r1] = readRAM(adju(r1,3),col);						
+		}	
+
+		// Shift array and write data into block RAMs every cycle 
+		// (RAM r2 receives entry r2+1, so the stored values move down by one)
+		par(r2=0;r2!=4;r2++)
+		{
+			writeRAM(adju(r2,3),colbis,DataArrayY[r2+1]);						
+		}																							                        
+
+        // Second pass over the 5-entry Y window produces the output.
+        KernelY(DataArrayY,Output);        
+    } // End Global par
+}
+
+/* 
+%   Input0			- Input derivative (first element for the product)
+%   Input1			- Input derivative (second element for the product)
+%   Output			- Product result of Input0xInput1 previously weighted
+%   ColumnLength	- Number of columns of the input data
+%	
+%	DESCRIPTION
+%				This function computes the weighted product of the derivatives
+%				in Input0 and Input1, using a Weighting5 function. It requires
+%				storing 5 rows (the 4 in the MPRAM plus the current one). Then 
+%				it performs the 2d separable convolution using the same kernel
+%				(Weighting5) for rows and columns
+%
+% RETURN
+%   
+*/
+macro proc WeightingMatrix_optf(Input0, Input1,Output, ColumnLength)
+{
+    // Multiplies Input0 by Input1 and smooths the product with the Weigh
+    // kernel in both directions: first over a 5-sample shift register, then
+    // over 4 values read back from block RAM plus the current first-pass
+    // result.
+    //   Input0, Input1 - the two factors
+    //   Output         - receives the second-pass result
+    //   ColumnLength   - value at which the RAM address counter wraps
+    // NOTE(review): PipeLatency, Retiming and aux are declared but not used
+    // anywhere in this macro.
+    macro expr PipeLatency=6 + 2;
+    macro expr Retiming=1;  // Retiming value = Retiming-1
+    macro expr Weigh=Weighting5; // Weighting5 or Weighting3
+ 	
+    // Declare MPRAM and access macros	
+    // Four dual-port RAMs, each MAX_RES_X/SCALE words of 2*width(Input0) bits.
+	static mpram 
+	{
+    	rom <signed (width(Input0)*2)> Read[(MAX_RES_X/SCALE)];      //  Read port
+     	wom <signed (width(Input0)*2)> Write[(MAX_RES_X/SCALE)];	    //  Write port
+	} ColumnsBuffer[4] with {block = "BlockRAM"}; // 10, no 4
+
+	macro expr readRAM (row,col) = (ColumnsBuffer[row]).Read[col]; 
+ 	macro proc writeRAM (row,col,data)
+	{
+		(ColumnsBuffer[row]).Write[col]=data;
+	}
+
+    signed (width(Input0)*2) DataArrayX[5], DataArrayY[5] ;     // 11, no 5
+	
+    // col is the RAM read address; colbis follows it one cycle later and is
+    // used as the write address.
+    static unsigned (log2ceil(MAX_RES_X/SCALE)) col=1, colbis=0;
+    signed (width(Output)) aux[Retiming];
+    
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {
+        // Read data into array every cycle   
+        // Both factors are extended to the width of the destination
+        // (2*width(Input0)). Input1 used to be extended to 2*width(Input1),
+        // which only matches the other operand and DataArrayX when both
+        // inputs have the same width.
+        DataArrayX[4]=adjs(Input0,2*width(Input0))*adjs(Input1,2*width(Input0));
+
+        // CASE (A): 5x5, 3x3 weighing function            
+    	// Shift X data through array
+    	par (i = 0; i != 4; i++)       
+    	{
+    	    DataArrayX[i] = DataArrayX[i+1];            
+    	}
+        // First pass: its result becomes the newest entry of the Y window.
+        Weigh(DataArrayX,DataArrayY[4]); 
+
+        /*  ::::::::::::::::::::::::::::::::::::::::::	*/			
+        
+        // Operations by columns                
+        // Address counter wraps to 0 after reaching ColumnLength-1.
+        col= col>=(ColumnLength-1) ? 0 : col+1;
+		colbis= col;
+					
+		// Read data into array every cycle 
+		par(r1=0;r1!=4;r1++)             
+		{
+			// Fill data through array
+			DataArrayY[r1] = readRAM(adju(r1,3),col);					
+		}	
+
+		// Shift array and write data into block RAMs every cycle 
+		// (RAM r2 receives entry r2+1, so the stored values move down by one)
+		par(r2=0;r2!=4;r2++)                 
+		{
+			writeRAM(adju(r2,3),colbis,DataArrayY[r2+1]);						
+		}																							                        
+
+        // Second pass over the 5-entry Y window produces the output.
+        Weigh(DataArrayY,Output); 
+    } // End Global par
+}
+
+/* 
+%   DataIn		- Input value (from the three frames)
+%	st			- Result of the spatial filter convolution
+%   dt			- Result of the spatial derivative convolution
+%	
+%	DESCRIPTION
+%				This function computes the derivative and spatial filtering from 
+%				the three pixel inputs separately.
+%
+% RETURN
+%   
+*/
+macro proc TemporalDerivative_optf(DataIn, dt, st)
+{   
+    // Both 3-tap filters consume the same DataIn window in parallel.
+    par
+    {       
+        // Derivative filter -> dt
+        Diff3Taps(DataIn,dt);
+        // Smoothing filter -> st
+        Prefilter3Taps(DataIn,st);
+    }
+}
+
+/* 
+%   FractionalShift		- Number of bits for the precision of the division
+%   detTH				- Energy threshold
+%   Axx					- IxIx*weight
+%   Axy					- IxIy*weight
+%   Ayy					- IyIy*weight
+%   Axt					- IxIt*weight
+%   Ayt					- IyIt*weight
+%   VxOut				- X Optical flow result
+%   VyOut				- Y Optical flow result
+%	
+%	DESCRIPTION
+%				This function solves the system (see paper in main.hcc). To 
+%				perform the division, a divisor is required. We use a CoreGenerator
+%				standard division core to improve the final performance. The number of
+%				bits of the division are set by FractionalShift. As we are always working
+%				with integer, to get float precision, we use shifts for the fractional part.
+%
+% RETURN
+%   
+*/
+macro proc FIXPOINTftu_optf(FractionalShift, detTH, Axx, Axy, Ayy, Axt, Ayt, VxOut, VyOut)
+{    
+    // Solves the 2x2 system for the flow vector in fixed point:
+    //   velx = Axy*Ayt - Ayy*Axt,  vely = Axx*Ayt - Axy*Axt,
+    //   detA = Axx*Ayy - Axy*Axy,
+    //   VxOut = velx/detA,  VyOut = -(vely/detA)
+    // Results whose determinant does not exceed detTH are replaced by the
+    // NaN pattern from SetNAN.
+    //   detTH              - determinant threshold
+    //   Axx,Axy,Ayy,Axt,Ayt - weighted derivative products
+    //   VxOut, VyOut       - flow components
+    // NOTE(review): FractionalShift, FRACTBITS and PipeLatency are not used
+    // anywhere in this macro.
+    macro expr Retiming=7;  
+    macro expr FRACTBITS=5;
+    macro expr FPSIZE=(2*width(Axx)+1);
+    macro expr PipeLatency=0;
+    macro expr MAX_24b = 16777215; //2^24 - 1
+   
+    // fix-point data registers
+	
+    signed FPSIZE velx, vely, detA, Aux0, Aux1, Aux2, Aux3, Aux4, Aux5;
+    signed DIVIDER_INPUT detAbis, velxbis, velybis;
+    signed DIVIDER_INPUT Vx_big, Vy_big; 
+    signed (width(VxOut)) Vx[Retiming], Vy[Retiming]; 
+    unsigned 1 AbovedetTH[DIVIDER_LATENCY];
+    
+ 
+        
+    //          Macro Begin
+    // ----------------------------------------------------
+    par
+    {        
+	    //Computing the values in the determinant
+        Aux0=adjs(Axy,FPSIZE)*adjs(Ayt,FPSIZE);
+        Aux1=adjs(Ayy,FPSIZE)*adjs(Axt,FPSIZE);
+        Aux2=adjs(Axx,FPSIZE)*adjs(Ayt,FPSIZE);
+        Aux3=adjs(Axy,FPSIZE)*adjs(Axt,FPSIZE);
+        Aux4=adjs(Axx,FPSIZE)*adjs(Ayy,FPSIZE);
+        Aux5=adjs(Axy,FPSIZE)*adjs(Axy,FPSIZE);
+        
+        // Numerators are scaled by 2^-4 and the determinant by 2^-8.
+        velx=(Aux0>>4)-(Aux1>>4);
+        vely=(Aux2>>4)-(Aux3>>4);
+        detA=(Aux4>>8)-(Aux5>>8);
+        
+        // Range guard before narrowing to sign + 24 bits.
+        // The numerators are signed, so both ends are checked: previously only
+        // the positive end was, and a large negative velx/vely was cut down to
+        // its low 24 bits and divided as if it were valid. detA keeps its
+        // one-sided check: a negative determinant stays negative after
+        // narrowing and is rejected by the threshold test below.
+        if((detA > MAX_24b) || (velx > MAX_24b) || (vely > MAX_24b) ||
+           (velx < -MAX_24b) || (vely < -MAX_24b))//Reducing errors
+        par{
+            detAbis = 1; //TH is at least 1
+            velxbis = 1;
+            velybis = 1;
+        }
+        else
+        par{
+            // Keep the sign bit and the 24 low-order bits.
+            detAbis = adjs(detA[FPSIZE-1]@detA[23:0], width(detAbis));
+            velxbis = adjs(velx[FPSIZE-1]@velx[23:0], width(velxbis));
+            velybis = adjs(vely[FPSIZE-1]@vely[23:0], width(velybis));
+        }
+    
+        // New pipelined division unit
+        par
+        {
+            division_core(velxbis, detAbis, Vx_big); 
+            //Vx_big = velxbis;
+            division_core(velybis, detAbis, Vy_big); 
+            //Vy_big = velybis;
+        }
+        // Control detA > TH
+        AbovedetTH[0]=(detAbis) > ((signed)adju(detTH,DIVIDER_INPUT));
+        
+
+        // delays for threshold and div	(synchronization)
+        // The flag travels through DIVIDER_LATENCY registers so that it lines
+        // up with the quotient it belongs to.
+        par(d=1;d<DIVIDER_LATENCY;d++)
+        {
+            AbovedetTH[d]=AbovedetTH[d-1];
+        }
+        
+        //Energy threshold
+        if (AbovedetTH[DIVIDER_LATENCY-1]!=0)
+        par
+        {
+            Vx[0]=adjs(Vx_big, width(VxOut));
+            Vy[0]=adjs(-Vy_big, width(VyOut));            
+        }
+        else
+        par
+        {
+			//Set to NaN (non valid values)
+            Vx[0]=SetNAN(VxOut);
+            Vy[0]=SetNAN(VyOut);
+        }    
+
+        //Retiming stages (improving final performance)
+        par(k=1;k<Retiming;k++)
+        {
+            Vx[k]=Vx[k-1];
+            Vy[k]=Vy[k-1];
+        }  
+
+        //Writing the outputs
+        VxOut=Vx[Retiming-1];
+        VyOut=Vy[Retiming-1];
+    }             
+}
+
+/* 
+%   Num			- Numerator
+%   Den			- Denominator
+%   Result		- Quotient
+%	
+%	DESCRIPTION
+%				This function computes the division of Num by Den, obtaining 
+%				the quotient that is returned in result. It can be done using the
+%				standard Handel-C implementation, as simply result = Num/Den.
+%				The problem is that the performance is affected by the required 
+%				logic and resources. This is why we are using a core from 
+%				the core Generator. The interface is divider_25 because we are using
+%				25 bits for the division to obtain a better precision.
+%
+% RETURN
+%   
+*/
+macro proc division_core(Num, Den, result)
+{	
+    // Enable for Cores
+    static signal unsigned 1 enable=0;
+    
+    interface divider_25 (signed DIVIDER_INPUT quot, signed DIVIDER_INPUT remd, unsigned 1 rfd) divider(signed  DIVIDER_INPUT dividend = Num, 
+							signed DIVIDER_INPUT divisor = adjs(Den,DIVIDER_INPUT), unsigned 1 clk=__clock, unsigned 1 ce=enable) with {busformat="B<I>"};
+    
+    par
+    {          
+        //Enabling division Core: only for 1 clock cycle
+        enable=1;        
+        result = divider.quot;        
+    }
+}
--- a/attention/attention_v0.1/lklib.hch
+++ b/attention/attention_v0.1/lklib.hch
+/* lklib.hch
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#ifndef __LKLIB__
+#define __LKLIB__
+
+#include "stdlib.hch"
+#include "generic.hch"
+
+#define XYTDERIVATIVESIZE 9
+#define PIXELSIZE 8
+
+#define DIVIDER_INPUT 25					//Input size of the divider core
+//Parenthesised so that the macro expands safely inside any expression (e.g. 2*DIVIDER_LATENCY)
+#define DIVIDER_LATENCY (DIVIDER_INPUT+4+1) 	//Latency of the divider core
+
+
+macro proc Prefilter5Taps(buffer,Out);	// main.hcc passes this macro as both kernel arguments of a spatial-convolution call; body not visible here
+macro proc Prefilter3Taps(buffer,Out);	// body not visible here; presumably the 3-tap counterpart - confirm
+
+macro proc Diff5Taps(buffer,Out);	// body not visible here
+macro proc Diff3Taps(buffer,Out);	// body not visible here
+
+macro proc Weighting(buffer,Out);	// body not visible here
+
+macro proc SpatialConvolutions_optf(Input,Output,KernelX,KernelY, ColumnLength);	// body not visible here
+
+macro proc WeightingMatrix_optf(Input0, Input1,Output, ColumnLength);	// body not visible here
+
+macro proc TemporalDerivative_optf(DataIn, dt, st);	// body not visible here
+
+macro proc FIXPOINTftu_optf(FractionalShift, detTH, Axx, Axy, Ayy, Axt, Ayt, VxOut, VyOut);	// writes VxOut/VyOut; both are set to NaN when the determinant does not exceed detTH
+
+macro proc division_core(Num, Den, quot);	// quot receives the divider-core quotient of Num by Den (the definition names this parameter "result")
+
+#endif
\ No newline at end of file
--- a/attention/attention_v0.1/main.hcc
+++ b/attention/attention_v0.1/main.hcc
+/* main.hcc
+%   Pixels		- RGB Input from channel 
+%             		* Pixels[7:0]		- Red color channel
+%             		* Pixels[15:8]		- Green color channel
+%             		* Pixels[23:16]		- Blue color channel
+%   Control		- Control word with the different parameters:
+%             		* Control[60:45]	- Latencies for the feature estimation (gabor modules)
+%             		* Control[44:36]	- Thresholds for the feature estimation 
+%             		* Control[35:26]	- Number of columns of the input images
+%             		* Control[25]		- Not read by main
+%             		* Control[24:21]	- Not used
+%             		* Control[20:13]	- Not used 
+%             		* Control[12:0]		- Latency cycles of the pipeline 
+%
+% RETURN
+%   Output      - Energy, 4 orientation maps, and RG and BY color differences
+%
+% DESCRIPTION
+%   A Handel-C implementation of the idea of 
+%   L. Itti and C. Koch, Computational modelling of visual attention, Nature Review Neuroscience, 
+%	2(3), pp. 194 – 203, 2001.
+%	F. Barranco, J. Diaz, B. Prieto, and E. Ros, Bottom-up visual attention model based on 
+%	FPGA, in Electronics, Circuits and Systems (ICECS), pp. 328 – 331, 2012.
+%
+%   Note that the paper describes most parameters of the algorithm and that it
+%	also describes a whole architecture for a coarse-to-fine estimation of the saliency.
+%	This file represents the implementation for the feature maps that combined can allow
+%	the saliency estimation. We also include the normalization operator.
+%
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+//Native Handel-C libraries
+#include "stdlib.hch"
+
+//Handel-C custom libraries
+#include "cores.hch"
+#include "channels.hch"
+#include "GaborPrimitives.hch"
+#include "generic.hch"
+//#include "opticflow.hch" //I think we do not need it but it is included in the package
+//#include "bilinear_warping_v2.hch" //I think we do not need it but it is included in the package
+
+//Set the clock values here
+//interface port_in (unsigned 1 clk with {clockport = 1}) ClockPort ();  //clk =__clock) ClockPort() ;
+//set clock = internal ClockPort.clk with { rate = 50 };
+
+
+
+/****************************************************************
+* Function  :   main                                            *
+*
+* Runs InterfazCore_lf_attention(Input, Output, Control, ImSize)
+* in parallel with an endless processing loop. For every image
+* the loop first clears enable, end, PipeDelay and counter and
+* latches the Control fields, then repeats one large par block
+* until end is set. All statements of that par block are issued
+* together in every iteration (the "long pipeline" mentioned in
+* the comments below).
+*
+* Per iteration the par block:
+*   - re-reads the Control fields and receives one 24-bit RGB
+*     word from Input;
+*   - gray path: weighted sum of the three colour bytes, then
+*     GaborBase, BuildGabor and Primitives_short, which produce
+*     energy and orientation[];
+*   - colour path: SpatialConvolutions_last on each colour, the
+*     maximum of R/G/B and the minimum of R/G, (R-G) and
+*     (B-min(R,G)) divided by that maximum through division_core,
+*     thresholding, Delaying and LATENCY_DIFFERENCE extra
+*     registers, giving RG_last and BY_last;
+*   - once PipeDelay has reached PipeLatency, sends energy,
+*     orientation[0], [2], [4], [6], RG_last and BY_last as one
+*     concatenated word on Output and counts it; end is set when
+*     counter equals ImSize-1.
+*
+* NOTE(review): Latencies is assigned in this function but its
+* local declaration is commented out; presumably it is declared
+* in one of the included headers - confirm.
+****************************************************************/
+void main(void)
+{	  
+    macro expr adjust=36; // not referenced again inside main
+    macro expr LATENCY_DIFFERENCE = 79-(35-3+1+8); //After removing atan2 cores from primitives (primitives_short); evaluates to 38
+    
+    UNSIGNED_CHANNEL Output;
+    UNSIGNED_CHANNEL Input;  
+    
+       
+    unsigned int 24 Pixels;
+    signed int XYTDERIVATIVESIZE Data[3]; //3 color channels
+    signed int F_BITS fe[NORIENTATIONS], fo[NORIENTATIONS];
+    signed int F_BITS fetmp[NORIENTATIONS], fotmp[NORIENTATIONS];   // not referenced again inside main
+    
+  
+    signal <unsigned int 61> Control;
+    signal <unsigned 1> rst; // not referenced again inside main
+    static unsigned int 4 nc=4;
+    unsigned int 10 Columns;
+    unsigned int 13 PipeLatency;
+    signed int 12 Threshold; 
+    unsigned 9 Pr_Threshold;    
+    unsigned 1 end, enable;
+    unsigned int 13 PipeDelay;    
+    unsigned 21 counter;
+    unsigned 21 ImSize;
+    //static unsigned 16 Latencies=22583; //for the gabor modules -- NOTE(review): disabled, yet Latencies is still assigned below
+    static signed XYTDERIVATIVESIZE threshold =25;// approx. 1/10 of max(R,G,B); distinct from the 12-bit Threshold above
+    
+    //New variables
+    signed (XYTDERIVATIVESIZE) Data_gray;    
+    signed (XYTDERIVATIVESIZE+6) gray_value; // 6 extra bits for the x64 scaled gray weights
+    unsigned 9 energy;
+    unsigned int 9 orientation[NORIENTATIONS];
+    signed int CONV_BITS FNYNX[16];
+    
+    
+    signed (XYTDERIVATIVESIZE) SmoothPixel[NFRAMES];
+    signed XYTDERIVATIVESIZE R, G, B; //signed for the subsequent stages 
+    unsigned 1 max_RGB_thd[DIVIDER_LATENCY]; // delay line for the reliability flag, one entry per divider latency cycle
+    signed XYTDERIVATIVESIZE max_RGB, max_RGB_1;
+    signed DIVIDER_INPUT R_1, G_1, B_1, min_RG, den, RG_num, BY_num, RG_pre, BY_pre;
+    signed DIVIDER_INPUT R_2, G_2, B_2, min_RG_1; //relative min
+    signed XYTDERIVATIVESIZE RG, BY, RG_out[LATENCY_DIFFERENCE], BY_out[LATENCY_DIFFERENCE], RG_last, BY_last;
+    	
+    par
+    {
+		//Call interface with Core local features for attention
+        InterfazCore_lf_attention(Input, Output, Control, ImSize);
+        
+		//Running continuously 
+        while(1)       	
+        {            
+            par // per-image reset of the control state, done before the pipeline loop starts
+			{
+				enable=0;
+                end=0;				
+				PipeDelay=0;                
+                counter=0;                
+                
+                Latencies = Control[60:45];
+                Pr_Threshold = Control[44:36];
+                Columns = Control[35:26];                
+                nc = Control[24:21]; //Not used
+                Threshold = (Control[20:13]==0) ? 0b011111111111 : ((signed 12) (0@Control[20:13])); //Not used
+                PipeLatency = Control[12:0];                
+			}
+            
+            do
+			{  
+				// All the statements below are executed at the same time: long pipeline
+				// There is an initial latency (PipeLatency iterations) before the first output is sent:
+                par
+                {
+                    enable=1;
+                    
+                    // Reading parameters                        
+                    Latencies = Control[60:45];
+                    Pr_Threshold = Control[44:36];
+                    Columns = Control[35:26];                        
+                    nc = Control[24:21]; //Not used
+                    Threshold = (Control[20:13]==0) ? 0b011111111111 : ((signed 12) (0@Control[20:13])); //Not used
+                    PipeLatency = Control[12:0];  
+                    
+					//Receive data (three pixels, RGB)
+                    //s1
+                    Receive(Input, Pixels);                                                                           
+
+                    //Extracting frame data (each byte is zero-extended to XYTDERIVATIVESIZE bits, then viewed as signed)
+                    //s2
+                    Data[0]= (signed) adju(Pixels[7:0], XYTDERIVATIVESIZE); //R value
+                    Data[1]= (signed) adju(Pixels[15:8], XYTDERIVATIVESIZE);//G value
+                    Data[2]= (signed) adju(Pixels[23:16], XYTDERIVATIVESIZE);//B value     
+                    
+                    par
+                    {
+                        //s3
+                        //Computing the Gray value
+                        //Constants multiplied by 64. Original formula: Gray = R*0.299 + G*0.587 + B*0.114
+                        gray_value = adjs(Data[0], width(gray_value))*19 + adjs(Data[1], width(gray_value))*38 + adjs(Data[2], width(gray_value))*7;
+                        
+                        
+                        //s4
+                        //Adjusting the size: drop the 6 least significant bits (undoes the x64 scaling)
+                        Data_gray = adjs(gray_value\\6, XYTDERIVATIVESIZE);
+
+                        
+                        //s5
+                        //-----------------------------------------------------------------
+                        //Par for the Energy and Orientation features (based on Gabor filters)
+                        //-----------------------------------------------------------------
+                        GaborBase(Data_gray, FNYNX, Columns);
+                            
+                            
+                        BuildGabor(FNYNX, fe, fo);
+                            
+                            
+                        Primitives_short(fe, fo, energy, orientation, Pr_Threshold, Latencies);
+                    }
+                    
+                    
+                    par{
+                        //Spatial convolutions: Using E. Simoncelli derivative and smoothing filters
+                        par(f=0;f<NFRAMES;f++) //NFRAMES == 3
+                        {
+                            //Latency == 2*Columns + 11 
+                            //SpatialConvolutions(((signed)adju(Data[f], XYTDERIVATIVESIZE)), SmoothPixel[f], Prefilter5Taps, Prefilter5Taps, Columns);
+                            SpatialConvolutions_last(Data[f], SmoothPixel[f], Prefilter5Taps, Prefilter5Taps, Columns);
+                        }
+                        
+                        
+                        //Splitting frame data                    
+                        //-----------------------------------------------------------------
+                        R = SmoothPixel[0]; //R value
+                        G = SmoothPixel[1]; //G value
+                        B = SmoothPixel[2]; //B value
+                        
+                        
+                        //Compute maximum and minimum value for yellow and normalization
+                        //Computing relative RGB maximum and relative RG minimum
+                        //-----------------------------------------------------------------
+                        if ( R > G)
+                        par
+                        { 
+                            min_RG = (signed DIVIDER_INPUT)(0@G); //zero-extended to DIVIDER_INPUT bits (original note: using 18 bits, 9 bits for the fractional part for the next division)
+                            if(R > B)
+                            par
+                            {
+                                max_RGB = R;
+                            }
+                            else
+                            par
+                            {
+                                max_RGB = B;
+                            }
+                        }
+                        else
+                        par
+                        {
+                            min_RG = (signed DIVIDER_INPUT)(0@R); //zero-extended to DIVIDER_INPUT bits (original note: using 18 bits, 9 bits for the fractional part for the next division)
+                            if(G > B)
+                            par
+                            {
+                                max_RGB = G;
+                            }
+                            else
+                            par
+                            {
+                                max_RGB = B;
+                            }
+                        }
+                        par
+                        {
+                            R_1 = (signed DIVIDER_INPUT)(0@R);
+                            G_1 = (signed DIVIDER_INPUT)(0@G);
+                            B_1 = (signed DIVIDER_INPUT)(0@B);
+                        }
+                                    
+                        
+                        //adjusting sizes for the division: keep the top bit and shift the remaining bits left by 9 (x512)
+                        par
+                        {
+                            R_2 = (signed)(R_1[DIVIDER_INPUT-1]@(R_1[DIVIDER_INPUT-2:0]<<9));
+                            G_2 = (signed)(G_1[DIVIDER_INPUT-1]@(G_1[DIVIDER_INPUT-2:0]<<9));
+                            B_2 = (signed)(B_1[DIVIDER_INPUT-1]@(B_1[DIVIDER_INPUT-2:0]<<9));
+                            min_RG_1 = (signed)(min_RG[DIVIDER_INPUT-1]@(min_RG[DIVIDER_INPUT-2:0]<<9));
+                            max_RGB_1 = max_RGB;
+                        }
+                        
+                        
+                        //Max_RGB inversion for normalization
+                        //-----------------------------------------------------------------
+                        par
+                        {
+                            RG_num = (R_2 - G_2); //DIVIDER_INPUT;  
+                            BY_num = (B_2 - min_RG_1);
+                            
+                            max_RGB_thd[0]=(max_RGB_1 > threshold); //Discard unreliable values: less than 1/10 of max. intensity of the image
+                            den = (signed DIVIDER_INPUT)(0@max_RGB_1);
+                        }
+                        
+                        
+                        //Normalize by the relative maximum
+                        //-----------------------------------------------------------------
+                        par
+                        {
+                            division_core(RG_num, den, RG_pre);//operands are DIVIDER_INPUT bits wide (the original note said 18-bit divider)
+                            division_core(BY_num, den, BY_pre);//operands are DIVIDER_INPUT bits wide (the original note said 18-bit divider)
+                        }
+                        par(d=1;d<DIVIDER_LATENCY;d++) // shift the reliability flag along its delay line
+                        {
+                            max_RGB_thd[d]=max_RGB_thd[d-1];
+                        }
+                        
+                        
+                        //Discard the unreliable values and compute the RG and BY ones
+                        //RG and BY: s + 1 + 6 = 8 bits (PSize)
+                        //RG and BY: s + 2 + 6 = 9 bits (XYTDERIVATIVESIZE) -- the original note said 8 bits
+                        //-----------------------------------------------------------------
+                        if (max_RGB_thd[DIVIDER_LATENCY-1]!=0)
+                        par
+                        {
+                            ////RG = adjs(RG_pre\\3, XYTDERIVATIVESIZE);
+                            ////BY = adjs(BY_pre\\3, XYTDERIVATIVESIZE);
+                            
+                            RG = adjs(RG_pre\\2, XYTDERIVATIVESIZE); // drop the 2 least significant bits, then resize
+                            BY = adjs(BY_pre\\2, XYTDERIVATIVESIZE); // drop the 2 least significant bits, then resize
+                        }
+                        else
+                        par //unreliable values
+                        {
+                            RG=0;
+                            BY=0;
+                            
+                        }
+                        
+                        //Delaying the result (3*Columns + 5)  for synchronization
+                        par{
+                            Delaying(RG, RG_out[0], Columns);
+                            Delaying(BY, BY_out[0], Columns);
+                        }
+                        
+                        //LATENCY_DIFFERENCE (79-(35-3+1+8) = 38) registers align the color opponency with the Energy+Orientation computation; 79 was the difference before the atan2 cores were removed
+                        par(cnt=1;cnt<LATENCY_DIFFERENCE;cnt++)
+                        {
+                            RG_out[cnt]=RG_out[cnt-1];
+                            BY_out[cnt]=BY_out[cnt-1];
+                        }
+                        
+                        par
+                        {
+                            RG_last = RG_out[LATENCY_DIFFERENCE-1];
+                            BY_last = BY_out[LATENCY_DIFFERENCE-1];
+                        }
+                    }
+                    
+                    
+                    if(PipeDelay==(PipeLatency)) // the pipeline has been filled: outputs are sent from now on
+                    par
+                    {  
+                        //Send the output: energy, four orientation maps, RG and BY concatenated into a single word
+                        UnsignedSecureSend(Output, ((unsigned)energy)@((unsigned)orientation[0])@((unsigned)orientation[2])@((unsigned)orientation[4])@((unsigned)orientation[6])@((unsigned)RG_last)@((unsigned)BY_last));
+                        //Pass only orientation[0], orientation[2], orientation[4] and orientation[6]: pi, 3*pi/2, 0 and pi/2
+                        
+                        end=(counter==ImSize-1); // stop after ImSize words have been sent
+                        counter++;
+                    }
+                    else
+                        PipeDelay++; // still filling the pipeline: count the initial latency
+                }
+            }while(!end);
+
+        }   // End while(1)
+    } // end global par
+}
--- a/attention/attention_v0.1/opticflow.hcc
+++ b/attention/attention_v0.1/opticflow.hcc
+#include <stdlib.hch>
+
+#include "opticflow.hch" 
+
+
+macro proc CoreATAN2CORDIC_fl(y, x, enable, angle)
+{
+	// Port widths and latency come from the ATAN2* definitions
+	macro expr InBits	= ATAN2WIDTH;
+	macro expr OutBits	= ATAN2OUTWIDTH;
+	macro expr Latency	= ATAN2LATENCY;	// kept from the original; not referenced below
+
+/*	Port list of the wrapped component:
+	component atan2cordic
+		port (
+		x_in: IN std_logic_VECTOR(20 downto 0);
+		y_in: IN std_logic_VECTOR(20 downto 0);
+		phase_out: OUT std_logic_VECTOR(20 downto 0);
+		clk: IN std_logic);
+	end component; */
+
+	interface ATAN2NAME (signed OutBits phase_out)
+		atan2(signed InBits x_in = x,
+			  signed InBits y_in = y,
+			  unsigned 1 clk = __clock,
+			  unsigned 1 ce  = enable)
+		with {busformat="B<I>"};
+
+	// Forward the phase output of the core to the caller
+	angle = atan2.phase_out;
+}
+
+macro proc CoreDIVIDER(my_dividend, my_divisor, result, enable)
+{
+	// Port widths of the DIVIDER_NAME core
+	macro expr OperandBits	= 21;
+	macro expr QuotientBits	= 21;
+	macro expr Latency		= 0;	// kept from the original; not referenced below
+
+	interface DIVIDER_NAME (signed QuotientBits quot,
+							signed QuotientBits remd,
+							unsigned 1 rfd)
+		divider(signed OperandBits dividend = my_dividend,
+				signed OperandBits divisor  = my_divisor,
+				unsigned 1 clk = __clock,
+				unsigned 1 ce  = enable)
+		with {busformat="B<I>"};
+
+	// Only the FLOW_BITS least significant bits of the quotient are returned
+	result = (divider.quot) <- FLOW_BITS;
+}
+macro proc CoreDIVIDER_2(my_dividend, my_divisor, result, enable)
+{
+	// Port widths of the DIVIDER_NAME_2 core
+	macro expr OperandBits	= 27;
+	macro expr QuotientBits	= 27;
+	macro expr Latency		= 0;	// kept from the original; not referenced below
+
+	interface DIVIDER_NAME_2 (signed QuotientBits quot,
+							  signed QuotientBits remd,
+							  unsigned 1 rfd)
+		divider(signed OperandBits dividend = my_dividend,
+				signed OperandBits divisor  = my_divisor,
+				unsigned 1 clk = __clock,
+				unsigned 1 ce  = enable)
+		with {busformat="B<I>"};
+
+	// Only the FLOW_BITS least significant bits of the quotient are returned
+	result = (divider.quot) <- FLOW_BITS;
+}
+
+
+//      Unsigned vector sum, expanded recursively as a balanced adder tree.
+//      Adds Array[begin] .. Array[Index], each element zero-extended (adju) to Extend bits.
+//************************************************************************************
+macro expr UnSumMacro(Array, begin, Index,Extend) =
+    let macro expr TreeAdd(Vec, Hi, Lo, Bits) = 
+        let macro expr Mid = Lo + (Hi-Lo)/2; in
+            select (Hi == Lo, adju(Vec[Hi],Bits),
+                TreeAdd(Vec, Hi, Mid + 1,Bits) + TreeAdd(Vec, Mid, Lo,Bits));
+        in
+            TreeAdd(Array, Index, begin,Extend);
+
+/*
+NOTE(review): the SumMacro definition below is disabled by this comment block,
+yet component_velocity calls SumMacro; presumably an included header provides it - confirm.
+
+macro expr SumMacro(Array, begin, Index,Extend) =
+    let macro expr RecurseAddAux(Array, Top, Bottom,Extend) = 
+        let macro expr Middle = Bottom + (Top-Bottom)/2; in
+            select (Top == Bottom, adjs(Array[Top],Extend),
+                RecurseAddAux(Array, Top, Middle + 1,Extend) + RecurseAddAux(Array, Middle, Bottom,Extend));
+        in
+            RecurseAddAux(Array, Index, begin,Extend);
+
+
+//***************************************************
+//Macro component_velocity
+//
+//For every orientation, fits a line a + b*x through the NFRAMES samples
+//P[f][orien] taken at x = XX[f] = 1, 2, 3:
+//		a = (SXX*Sy - SX*Sxy)/6,   b = (NFRAMES*Sxy - SX*Sy)/6
+//with Sy = sum of P, Sxy = sum of P*XX, SX = 6 and SXX = 14; the division
+//by 6 is approximated by (*43)>>8.
+//LE[orien] is built from the sum of the squared residuals (a + b*XX[f] - P[f][orien]),
+//scaled by (*85)>>8 (about 1/3) and with the 2 least significant bits dropped.
+//FVreal/FVimag[orien] are b multiplied by WREAL/WIMAG[orien], with the
+//4 least significant bits dropped and truncated to the width of the output.
+//
+//LATENCY = 4;
+//NOTE(review): the body below has ten labelled stages (0, 1, 2_0, 2_1, 2_2,
+//3_0, 3_1, 4_0, 4_1, 4_2), so the figure above looks stale - confirm.
+//
+//
+//bits format: 	
+//IN:			P[NFRAMES][NORIENT]					--> sign-4-5
+//	
+//OUT:			FVreal[NFRAMES], FVimag[NFRAMES] 	--> sign-14-5
+//				LE[NFRAMES]							--> sign-28-5
+//NOTE(review): the code indexes FVreal, FVimag and LE by orientation
+//(0 .. NORIENTATIONS-1), not by frame.
+//				
+//***************************************************/
+macro proc component_velocity(P, FVreal, FVimag, LE){
+
+	//***********************************************/
+	//Constant definitions for 3 frames
+	//***********************************************
+	macro expr PSize			= 10;
+	macro expr SXX  			= 14; // 1*1 + 2*2 + 3*3
+	macro expr SX 				= 6; // 1 + 2 + 3
+	macro expr DEN 		    	= 6;	// NFRAMES*SXX - SX*SX for 3 frames; only used by commented-out code below
+	const int 3 XX[NFRAMES]		= {1, 2, 3}; //XX3 is XX in the third dimension
+	const int 8 WREAL[NORIENTATIONS]	= {-81, -75, -58, -31, 0, 31, 58, 75};	//	25 * {-F0 * cos(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1} 
+	const int 8 WIMAG[NORIENTATIONS]	= {0, -31, -58, -75, -81, -75, -58, -31};	//	25 * {-F0 * sin(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1}
+
+	//***********************************************
+	//Declarations: Variables
+	//***********************************************
+	int (PSize+2) Sxy[NORIENTATIONS]; // sum over frames of P*XX
+    int (PSize+1) Sxy_0[NORIENTATIONS][NFRAMES]; // per-frame products P*XX
+	int (PSize+1) Sy[NORIENTATIONS]; // sum over frames of P
+    int PSize Sy_0[NORIENTATIONS][NFRAMES];	// per-frame copies of P
+
+	int (PSize+3) a[NORIENTATIONS]; // intercept of the fitted line
+    int (PSize+5) a_0[NORIENTATIONS]; // SXX*Sy
+    int (PSize+5) a_1[NORIENTATIONS]; // SX*Sxy
+    int (PSize+12) a_2[NORIENTATIONS]; // a_0 - a_1
+    
+	int (PSize+2) b[NORIENTATIONS];  // slope of the fitted line
+    int (PSize+4) b_0[NORIENTATIONS]; // NFRAMES*Sxy
+    int (PSize+4) b_1[NORIENTATIONS]; // SX*Sy
+    int (PSize+11) b_2[NORIENTATIONS]; // b_0 - b_1
+    
+
+
+    int (PSize+3) a3_0[NORIENTATIONS];  // a delayed by one stage
+    
+    int (PSize+2)  bs3[NORIENTATIONS]; // b delayed by one stage
+    int (PSize+2)  bs3_1[NORIENTATIONS]; // b delayed by two stages
+    int (PSize+2)  bs4_0[NORIENTATIONS]; // b delayed by three stages
+    int (PSize+2)  bs4_1[NORIENTATIONS]; // b delayed by four stages
+	
+
+
+    int (PSize+3) Reg[NFRAMES][NORIENTATIONS]; // fitted value a + b*XX[f]
+    int (PSize+3) Reg_0[NFRAMES][NORIENTATIONS]; // b*XX[f]
+	
+	//Pipeline auxiliary variables (copies of P carried from stage to stage)
+	int PSize Ps0[NFRAMES][NORIENTATIONS];
+    int PSize Ps1[NFRAMES][NORIENTATIONS];
+	int PSize Ps2[NFRAMES][NORIENTATIONS];
+    int PSize Ps2_1[NFRAMES][NORIENTATIONS];
+    int PSize Ps2_2[NFRAMES][NORIENTATIONS];
+	int PSize Ps3[NFRAMES][NORIENTATIONS];
+    int PSize Ps3_1[NFRAMES][NORIENTATIONS];
+	 
+    
+    int (PSize+3) LE_0[NORIENTATIONS][NFRAMES]; // residual: fitted value minus sample
+    int (2*PSize) LE_1[NORIENTATIONS][NFRAMES]; // squared residual
+
+	
+	//***********************************************
+	//Body of the function
+	//***********************************************
+	par(orien=0;orien<NORIENTATIONS;orien++)
+	{
+	
+		//Pipeline Stage 0
+        par(f=0;f<NFRAMES;f++)
+        {
+            Sxy_0[orien][f] = adjs(P[f][orien],width(Sxy_0))*adjs(XX[f],width(Sxy_0));
+            Sy_0[orien][f] = P[f][orien];
+
+            //Copying P for the next stage
+            Ps0[f][orien]=P[f][orien];
+        }
+        //Pipeline Stage 1
+		par
+		{
+			//Sxy[orien] = (adjs(P[0][orien],width(Sxy))*adjs(XX[0],width(Sxy)) + adjs(P[1][orien],width(Sxy))*adjs(XX[1],width(Sxy)) + adjs(P[2][orien],width(Sxy))*adjs(XX[2],width(Sxy)) + adjs(P[3][orien],width(Sxy))*adjs(XX[3],width(Sxy)) + adjs(P[4][orien],width(Sxy))*adjs(XX[4],width(Sxy)));	
+            Sxy[orien] = SumMacro(Sxy_0[orien], 0, NFRAMES-1,width(Sxy));
+			//Sy[orien]  = adjs(P[0][orien],width(Sy)) + adjs(P[1][orien],width(Sy)) + adjs(P[2][orien],width(Sy)) + adjs(P[3][orien],width(Sy)) + adjs(P[4][orien],width(Sy));
+            Sy[orien] = SumMacro(Sy_0[orien], 0, NFRAMES-1,width(Sy));
+
+			//Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps1[f][orien]=Ps0[f][orien];
+			}
+		}
+
+		//Pipeline Stage 2_0
+        par
+        {
+            a_0[orien] = SXX*adjs(Sy[orien],width(a_0));
+            b_0[orien] = NFRAMES*adjs(Sxy[orien],width(b_0));
+                    
+            a_1[orien] = SX*adjs(Sxy[orien],width(a_1));
+            b_1[orien] = SX*adjs(Sy[orien],width(b_1));
+
+            //Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2[f][orien]=Ps1[f][orien];
+			}
+        } 
+        //Pipeline Stage 2_1
+        par
+        {
+            //a_2[orien] = (a_1[orien]-a_2[orien])*5;  // 5 frames
+            //b_2[orien] = (b_1[orien]-b_2[orien])*5;  // multiplied by 5 for following /50 division that become <<8 : 5/256 ~= 1/50
+            a_2[orien] = (adjs(a_0[orien],width(a_2))-adjs(a_1[orien],width(a_2)));    // 3 frames
+            b_2[orien] = (adjs(b_0[orien],width(b_2))-adjs(b_1[orien],width(b_2)));
+
+            //Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2_1[f][orien]=Ps2[f][orien];
+			}
+        } 
+        //Pipeline Stage 2_2
+		par
+		{
+			//Using 5 decimals for a and b (*25)
+			//a[orien] = (adjs(SXX,PSize+15)*32*adjs(Sy[orien],PSize+15) - adjs(SX,PSize+15)*32*adjs(Sxy[orien],PSize+15))/adjs(DEN,PSize+15);
+            //a[orien] = a_2[orien]<<8;  //for 5 frames
+            //a[orien] = a_2[orien]<<3;  //for 3 frames
+			//a[orien] = adjs(((a_2[orien])*21)>>7,width(a));  //for 3 frames
+            a[orien] = adjs((a_2[orien]*43)>>8,width(a));  //for 3 frames: 43/256 approximates the division by 6
+            
+			//b[orien] = adjs((NFRAMES*32*adjs(Sxy[orien],PSize+13) - adjs(SX,PSize+13)*32*adjs(Sy[orien],PSize+13))/adjs(DEN,PSize+13), width(b));
+            //b[orien] = adjs(b_2[orien]<<8,width(b));  // for 5 frames
+            //b[orien] = adjs(((b_2[orien])*21)>>7,width(b));  // for 3 frames
+            b[orien] = adjs((b_2[orien]*43)>>8,width(b));  // for 3 frames: 43/256 approximates the division by 6
+            
+			
+			//Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2_2[f][orien]=Ps2_1[f][orien];
+			}
+		}
+
+		//Pipeline Stage 3_0
+        par
+        {
+            par(f=0;f<NFRAMES;f++)
+            {
+                Reg_0[f][orien] = adjs(b[orien],width(Reg_0))*adjs(XX[f],width(Reg_0));
+            
+                //Copying P for the next stage
+    		    Ps3[f][orien]=Ps2_2[f][orien];
+            }
+            //Copying a and b for the next stage
+			bs3[orien]=b[orien];
+            a3_0[orien] = a[orien];
+        }
+        //Pipeline Stage 3_1
+		par
+		{
+			par(f=0;f<NFRAMES;f++)
+			{
+				//Reg[fr][orien] = adjs(a[orien],width(Reg))+ adjs(b[orien],width(Reg))*adjs(XX[fr],width(Reg));
+                Reg[f][orien] = adjs(a3_0[orien],width(Reg))+ adjs(Reg_0[f][orien],width(Reg));
+			
+				//Copying P for the next stage
+				Ps3_1[f][orien]=Ps3[f][orien];
+			}
+			
+			//Copying b for the next stage
+			bs3_1[orien]=bs3[orien];
+		}
+	
+		//Pipeline Stage 4_0
+        par
+        {
+            par(f=0;f<NFRAMES;f++)
+            {
+                LE_0[orien][f] = adjs(Reg[f][orien],width(LE_0)) - adjs(Ps3_1[f][orien], width(LE_0));
+            }
+            //Copying b for the next stage
+            bs4_0[orien]=bs3_1[orien];
+        }
+        //Pipeline Stage 4_1
+            par
+            {
+            par(f=0;f<NFRAMES;f++)
+            {
+                LE_1[orien][f] = adjs(LE_0[orien][f],width(LE_1))*adjs(LE_0[orien][f],width(LE_1));
+            }
+            //Copying b for the next stage
+            bs4_1[orien]=bs4_0[orien];
+        }
+        //Pipeline Stage 4_2
+		par
+		{
+			//LE[orien] = adjs(((((adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien], 2*PSize+26)*32)*(adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien],2*PSize+26)*32) + (adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32)*(adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32) + (adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32)*(adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32) + (adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32)*(adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32) + (adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32)*(adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32))/NFRAMES)\\15), width(LE));
+            //LE[orien] = adjs(((SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize)*21)>>6)\\2,width(LE));
+            LE[orien] = adjs(((SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize+2)*85)>>8)\\2,width(LE)); // 85/256 approximates the division by NFRAMES (3); then the 2 least significant bits are dropped
+			//LE[orien]=adjs(((LE_1[0][orien]+LE_1[1][orien]+LE_1[2][orien])>>2)\\15,width(LE));
+
+			//LE[orien] = adjs(SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize+26)\\15, width(LE));
+			
+			//Simplifying the equation: FVreal = - (F0*cos(ang)/2*PI)*b[orien] --> FVreal = Wreal[orien]*b[orien] //Wreal is initialised with factor 25
+			//							FVimag = - (F0*sin(ang)/2*PI)*b[orien] --> FVimag = Wimag[orien]*b[orien] //Wimag is initialised with factor 25
+			
+			//FVreal[orien]= (adjs(bs4_1[orien],PSize+8)*adjs(WREAL[orien],PSize+8))\\6; //final size of FVreal is PSize+18
+			//FVimag[orien]= (adjs(bs4_1[orien],PSize+8)*adjs(WIMAG[orien],PSize+8))\\6; //final size of FVimag is PSize+18
+			//FVreal[orien]= ((adjs(bs4_1[orien],PSize+8)*adjs(WREAL[orien],PSize+8))\\2)<-width(FVreal); //final size of FVreal is PSize+18
+			//FVimag[orien]= ((adjs(bs4_1[orien],PSize+8)*adjs(WIMAG[orien],PSize+8))\\2)<-width(FVimag); //final size of FVimag is PSize+18
+            FVreal[orien]= ((adjs(bs4_1[orien],PSize+10)*adjs(WREAL[orien],PSize+10))\\4)<-width(FVreal); //the result takes the width of the caller's FVreal (original note: final size of FVreal is PSize+18)
+			FVimag[orien]= ((adjs(bs4_1[orien],PSize+10)*adjs(WIMAG[orien],PSize+10))\\4)<-width(FVimag); //the result takes the width of the caller's FVimag (original note: final size of FVimag is PSize+18)
+		}
+	}
+}
+//***************************************************
+//Macro component_velocity
+//
+//LATENCY = 4;
+//
+//
+//bits format: 	
+//IN:			P[NFRAMES][NORIENT]					--> sign-4-5
+//	
+//OUT:			FVreal[NFRAMES], FVimag[NFRAMES] 	--> sign-14-5
+//				LE[NFRAMES]							--> sign-28-5
+//				
+//***************************************************
+macro proc component_velocity_mia(P, FVreal, FVimag, LE){
+
+	
+	//***********************************************
+	//Constant definitions 
+	//***********************************************
+/*	macro expr PSize			= 10;
+	const int 7 SXX 			= 55;
+	const int 5 SX 				= 15;
+	const int 7 DEN 			= 50;
+	macro expr NORIENT  		= 8;
+	macro expr NFRAMES  		= 5;
+	const int 4 XX[NFRAMES]		= {1, 2, 3, 4, 5}; //XX3 is XX in the third dimension
+	const int 6 WREAL[NORIENT]	= {-20, -19, -14, -8, 0, 8, 14, 19};	//	2^5 * {-F0 * cos(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1} 
+	const int 6 WIMAG[NORIENT]	= {0, -8, -14, -19, -20, -19, -14, -8};	//	2^5 * {-F0 * sin(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1}
+*/	
+
+
+
+	//***********************************************/
+	//Constant definitions for 3 frames
+	//***********************************************
+	macro expr PSize			= 10;
+	const int 7 SXX 			= 14;
+	const int 5 SX 				= 6;
+	const int 7 DEN 			= 6;	
+	const int 4 XX[NFRAMES]		= {1, 2, 3}; //XX3 is XX in the third dimension
+	const int 6 WREAL[NORIENTATIONS]	= {-20, -19, -14, -8, 0, 8, 14, 19};	//	2^5 * {-F0 * cos(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1} 
+	const int 6 WIMAG[NORIENTATIONS]	= {0, -8, -14, -19, -20, -19, -14, -8};	//	2^5 * {-F0 * sin(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1}
+
+	//***********************************************
+	
+	//***********************************************
+	//Declarations: Variables
+	//***********************************************
+	//unsigned int 3 orien;
+	//unsigned int 3 fr;
+
+	//R 
+	int (PSize+3) Sxy[NORIENTATIONS]; 
+	int (PSize+4) Sy[NORIENTATIONS];	
+	int (PSize+15) a[NORIENTATIONS];  //(PSize+10)Q5
+	int (PSize+13) b[NORIENTATIONS];  //(PSize+8)Q5
+	int (PSize+15) Reg[NFRAMES][NORIENTATIONS]; //(PSize+11)Q5 - 5 --> because a, b are splited by DEN (==50)
+	//\R
+
+	//Pipeline auxiliary variables
+	int PSize Ps1[NFRAMES][NORIENTATIONS];
+	int PSize Ps2[NFRAMES][NORIENTATIONS];
+	int PSize Ps3[NFRAMES][NORIENTATIONS];
+	int (PSize+13)  bs3[NORIENTATIONS]; 
+
+	
+	//***********************************************
+	//Body of the function
+	//***********************************************
+	par(orien=0;orien<NORIENTATIONS;orien++)
+	{
+	
+		//Pipeline Stage 1
+		par
+		{
+			Sxy[orien] = adjs(P[0][orien],width(Sxy))*adjs(XX[0],width(Sxy)) + adjs(P[1][orien],width(Sxy))*adjs(XX[1],width(Sxy)) + adjs(P[2][orien],width(Sxy))*adjs(XX[2],width(Sxy)); //+ adjs(P[3][orien],width(Sxy))*adjs(XX[3],width(Sxy)) + adjs(P[4][orien],width(Sxy))*adjs(XX[4],width(Sxy)));	
+			Sy[orien]  = adjs(P[0][orien],width(Sy)) + adjs(P[1][orien],width(Sy)) + adjs(P[2][orien],width(Sy));// + adjs(P[3][orien],width(Sy)) + adjs(P[4][orien],width(Sy));
+
+			//Copying P for the next stage
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Ps1[fr][orien]=P[fr][orien];
+			}
+		}
+
+		//Pipeline Stage 2 
+		par
+		{
+			//Using 5 decimals for a and b (*2^5)
+			a[orien] = (adjs(SXX,PSize+15)*32*adjs(Sy[orien],PSize+15) - adjs(SX,PSize+15)*32*adjs(Sxy[orien],PSize+15))/adjs(DEN,PSize+15);
+			b[orien] = adjs((NFRAMES*32*adjs(Sxy[orien],PSize+13) - adjs(SX,PSize+13)*32*adjs(Sy[orien],PSize+13))/adjs(DEN,PSize+13), width(b));
+			
+			//Copying P for the next stage
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Ps2[fr][orien]=Ps1[fr][orien];
+			}
+		}
+
+		//Pipeline Stage 3
+		par
+		{
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Reg[fr][orien] = adjs(a[orien],width(Reg))+ adjs(b[orien],width(Reg))*adjs(XX[fr],width(Reg));
+			
+				//Copying P for the next stage
+				Ps3[fr][orien]=Ps2[fr][orien];
+			}
+			
+			//Copying b for the next stage
+			bs3[orien]=b[orien];
+		}
+	
+		//Pipeline Stage 4
+		par
+		{
+			LE[orien] = adjs(((((adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien], 2*PSize+26)*32)*(adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien],2*PSize+26)*32) + (adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32)*(adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32) + (adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32)*(adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32))/NFRAMES)\\15), width(LE)); //+ (adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32)*(adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32) + (adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32)*(adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32))/NFRAMES)\\15), width(LE));
+			
+			//Simplifying the equation: FVreal = - (F0*cos(ang)/2*PI)*b[orien] --> FVreal = Wreal[orien]*b[orien] //Wreal is initialised with factor 2^5
+			//							FVimag = - (F0*sin(ang)/2*PI)*b[orien] --> FVimag = Wimag[orien]*b[orien] //Wreal is initialised with factor 2^5
+			
+			FVreal[orien]= (adjs(bs3[orien],PSize+20)*adjs(WREAL[orien],PSize+20))\\10; //final size of FVreal is PSize+18
+			FVimag[orien]= (adjs(bs3[orien],PSize+20)*adjs(WIMAG[orien],PSize+20))\\10; //final size of FVimag is PSize+18
+		}
+	}
+}
+
+
+
+
+
+
+//***************************************************
+//Macro compute_phase
+//
+//LATENCY = 14;
+//
+//bits format: 	
+//IN:			Greal, Gimag 	--> sign-8-1
+//
+//OUT:			P 				--> sign-2-6
+//
+//***************************************************
+macro proc compute_phase(Greal, Gimag, P){ // Phase for every (frame, orientation): P[fr][orien] from Gimag[fr][orien], Greal[fr][orien] via the atan2 CORDIC core
+	// The two replicated par loops below expand one core call per (frame, orientation) pair.
+	macro expr PipeLatency=ATAN2LATENCY; // PipeDelay value at which the core output is copied into P
+	// NOTE(review): PipeDelay has no initial value here and is incremented from every replicated branch -- confirm intent.
+    unsigned int 5 PipeDelay;
+	signed 10 aux[NFRAMES][NORIENTATIONS];  //, auxGimag[NFRAMES][NORIENTATIONS], auxGreal[NFRAMES][NORIENTATIONS];	
+    static signal unsigned 1 enable=0; // flag handed to every core call; declared with default 0
+
+	/*/Interface definition
+	interface atan(int 9 phase_out)
+	myatan(int 10 x_in=a, int 10 y_in=b, unsigned 1 clk= __clock) with {busformat="BI"};  */
+
+
+    par(orien=0; orien<NORIENTATIONS;orien++)
+	{
+		//Enabling atan2 Core: only for 1 clock cycle
+        enable=1;
+        
+        //atan2 - core generator: 
+		//Inputs have to be in [-1, 1]
+		//Outputs are in [-PI, PI]
+		par(fr=0;fr<NFRAMES;fr++)
+		{
+			/*if(abs(Gimag[orien][fr])> abs(Greal[orien][fr])){
+				par
+				{
+					//Pipeline Stage 1
+					a=Gimag[orien][fr]/(abs(Gimag[orien][fr])+1);
+					b=Greal[orien][fr]/(abs(Gimag[orien][fr])+1);
+				
+					//Pipeline Stage 2					
+                    P[orien][fr]=myatan.phase_out; //Latency = 13 
+				}
+			}
+			else{
+				par
+				{
+					//Pipeline Stage 1
+					a=Gimag[orien][fr]/(abs(Greal[orien][fr])+1);
+					b=Greal[orien][fr]/(abs(Greal[orien][fr])+1);
+
+					//Pipeline Stage 2
+					P[orien][fr]=myatan.phase_out; //Latency = 13
+				}
+
+			} */ 
+                        
+            /*/ to remove undet values of atan2(0,0)
+            if(Gimag[fr][orien]==0 && Greal[fr][orien]==0)
+            par
+            {
+                auxGimag[fr][orien]=0;
+                auxGreal[fr][orien]=511;
+            }   
+            else
+            par
+            {
+                auxGimag[fr][orien]=Gimag[fr][orien];
+                auxGreal[fr][orien]=Greal[fr][orien];
+            } */
+            
+            CoreATAN2CORDIC_fl(Gimag[fr][orien], Greal[fr][orien], enable, aux[fr][orien]); // core defined elsewhere; its 10-bit result is received in aux[fr][orien]
+            // While PipeDelay differs from PipeLatency only the counter advances; once equal, P takes aux with its least significant bit dropped.
+            if(PipeDelay==PipeLatency)                 
+                P[fr][orien] = (aux[fr][orien])\\1;                
+            else
+                PipeDelay++;  
+		}
+
+	}
+        
+
+}
+macro proc compute_phase_top(Greal, Gimag, P, index) // Runs function_compute_phase after `index` delay cycles and pads with delays so that NFRAMES-1 delay cycles are spent in total
+{
+    signed F_BITS auxGreal[NORIENTATIONS], auxGimag[NORIENTATIONS];
+    signed 9 P_Tmp[NORIENTATIONS];
+    // Three branches are started together: input latch, delayed phase computation, output copy.
+    par
+    {
+        seq
+        {
+            seq(i=0; i<NFRAMES-1; i++) // replicated seq: expands to NFRAMES-1 delay statements
+            {
+                delay;
+            }
+            par(s=0;s<NORIENTATIONS;s++) // then latch the inputs of every orientation into the local copies
+            {
+                auxGreal[s]=Greal[s];
+                auxGimag[s]=Gimag[s];
+            }
+        }
+        seq
+        {
+            ifselect(index!=0) // compile-time selection: no delay statements are generated when index is 0
+            {             
+                seq(t=0; t<index; t++) 
+                {
+                    delay;
+                }
+            }
+            // NOTE(review): this call can start before the latch branch above has written auxGreal/auxGimag -- confirm the values it is meant to read.
+            function_compute_phase(auxGreal,auxGimag, P_Tmp);
+            
+            ifselect(index!=NFRAMES-1) // compile-time selection: pad with the remaining NFRAMES-1-index delays
+            {             
+                seq(k=index; k<NFRAMES-1; k++)
+                {
+                    delay;
+                }
+            }
+        }//seq
+        // NOTE(review): this copy is a par branch of its own, so it starts together with the two seq branches rather than after them -- confirm P is meant to receive the previously stored P_Tmp.
+        par(o=0;o<NORIENTATIONS;o++)
+        {
+            P[o]=P_Tmp[o];
+        }
+        
+    } // par
+    
+}
+
+void function_compute_phase(signed int F_BITS (*Greal),signed int F_BITS (*Gimag), signed int 9 *P) // Function wrapper around the compute_phase_index macro procedure
+{
+    compute_phase_index(Greal,Gimag,P); // the three pointers are forwarded unchanged
+}
+
+//***************************************************
+//Macro compute_phase_index
+//
+//LATENCY = 14;
+//
+//bits format: 	
+//IN:			Greal, Gimag 	--> sign-8-1
+//
+//OUT:			P 				--> sign-2-6
+//
+//***************************************************
+macro proc compute_phase_index(Greal, Gimag, P){ // Single-frame phase: P[orien] from Gimag[orien], Greal[orien] via the atan2 CORDIC core
+	// The replicated par below expands one core call per orientation.
+	macro expr PipeLatency=ATAN2LATENCY; // PipeDelay value at which the core output is copied into P
+	// NOTE(review): PipeDelay has no initial value here and is incremented from every replicated branch -- confirm intent.
+    unsigned int 5 PipeDelay;
+	signed 10 aux[NORIENTATIONS];  //, auxGimag[NFRAMES][NORIENTATIONS], auxGreal[NFRAMES][NORIENTATIONS];	
+    static signal unsigned 1 enable=0; // flag handed to every core call; declared with default 0
+
+    par(orien=0; orien<NORIENTATIONS;orien++)
+	{
+		//Enabling atan2 Core: only for 1 clock cycle
+        enable=1;
+        
+        CoreATAN2CORDIC_fl(Gimag[orien], Greal[orien], enable, aux[orien]); // core defined elsewhere; its 10-bit result is received in aux[orien]
+        // While PipeDelay differs from PipeLatency only the counter advances; once equal, P takes aux with its least significant bit dropped.
+        if(PipeDelay==PipeLatency)                 
+            P[orien] = (aux[orien])\\1;                
+        else
+            PipeDelay++;  
+	}//par
+}
+//***************************************************
+//Macro compute_single_phase
+//
+//LATENCY = 14;
+//
+//bits format: 	
+//IN:			Greal, Gimag 	--> sign-8-1
+//
+//OUT:			P 				--> sign-2-6
+//
+//***************************************************
+macro proc compute_single_phase(Greal, Gimag, P) // Single-frame phase: P[orien] from Gimag[orien], Greal[orien] via the atan2 CORDIC core
+{
+    macro expr PipeLatency=ATAN2LATENCY; // PipeDelay value at which the core output is copied into P
+    // NOTE(review): PipeDelay has no initial value here and is incremented from every replicated branch -- confirm intent.
+    unsigned int 5 PipeDelay;
+	signed 10 aux[NORIENTATIONS];  //, auxGimag[NFRAMES][NORIENTATIONS], auxGreal[NFRAMES][NORIENTATIONS];	
+    static signal unsigned 1 enable=0; // flag handed to every core call; declared with default 0
+
+	/*/Interface definition
+	interface atan(int 9 phase_out)
+	myatan(int 10 x_in=a, int 10 y_in=b, unsigned 1 clk= __clock) with {busformat="BI"};  */
+
+    // The active statements below match those of compute_phase_index.
+    par(orien=0; orien<NORIENTATIONS;orien++)
+	{
+		//Enabling atan2 Core: only for 1 clock cycle
+        enable=1;
+        
+        //atan2 - core generator: 
+		//Inputs have to be in [-1, 1]
+		//Outputs are in [-PI, PI]
+		  
+        CoreATAN2CORDIC_fl(Gimag[orien], Greal[orien], enable, aux[orien]); // core defined elsewhere; its 10-bit result is received in aux[orien]
+        // While PipeDelay differs from PipeLatency only the counter advances; once equal, P takes aux with its least significant bit dropped.
+        if(PipeDelay==PipeLatency)                 
+            P[orien] = (aux[orien])\\1;                
+        else
+            PipeDelay++; 
+	}
+}
+
+
+//***************************************************
+//Macro unwrap
+//
+//LATENCY = 12;
+//
+//bits format: 	
+//IN:			Pin [NFRAMES][NORIENT]	--> sign-2-6
+//
+//OUT:			Pout[NFRAMES][NORIENT]	--> sign-4-5
+//
+//***************************************************
+macro proc unwrap(Pin, Pout){ // Temporal phase unwrapping over frames 0..4, one 12-stage pipeline per orientation; Pin and Pout are indexed [frame][orientation]
+	// Frame indices 0..4 are written out explicitly in the body, so this version presumes NFRAMES is 5.
+	//***********************************************
+	//Constant definitions 
+	//***********************************************
+	macro expr DOUBLE_PI  		= 402; // 2*PI with 6 fractional bits (2*PI*64 is about 402)
+	//macro expr PI  				= 201;
+	macro expr PSize			= 10;
+	macro expr NORIENT  		= 8; // local orientation count used by this macro instead of NORIENTATIONS
+	//macro expr NFRAMES  		= 5;
+	// PI and NFRAMES are not defined locally (both definitions are commented out); they come from the enclosing scope.
+	
+	
+	//***********************************************
+	//Declarations: Variables
+	//***********************************************
+	unsigned 3 fr;
+	unsigned 3 orien;
+	//static unsigned int 3 cur_frame= 1;
+	//unsigned int 3 cur_frame;
+	unsigned int 1 A[NORIENT]; // A*: 1 when the frame-to-frame difference exceeds PI in magnitude
+	int PSize D[NORIENT]; // D*: difference between two consecutive frames
+	unsigned int 3 cf;
+
+	// Pin_k holds the phases after k-1 corrections; each correction round adds one bit of width.
+	int (PSize+1) Pin_2[NFRAMES][NORIENT];
+	int (PSize+2) Pin_3[NFRAMES][NORIENT];
+	int (PSize+3) Pin_4[NFRAMES][NORIENT];
+	int (PSize+2) D_2[NORIENT];
+	int (PSize+3) D_3[NORIENT];
+	int (PSize+4) D_4[NORIENT];
+	unsigned int 1 A_2[NORIENT];
+	unsigned int 1 A_3[NORIENT];
+	unsigned int 1 A_4[NORIENT];
+
+	// Names ending in s<N> are copies that carry a value through pipeline stage N.
+	//Pipeline auxiliary variable declarations
+	int PSize Ds2[NORIENT];
+	int (PSize+2) D_2s5[NORIENT];
+	int (PSize+3) D_3s8[NORIENT];
+	int (PSize+4) D_4s11[NORIENT];
+	
+	int (PSize-1) Pins1[NFRAMES][NORIENT];
+	int (PSize-1) Pins2[NFRAMES][NORIENT];
+
+	int (PSize+1) Pin_2s4[NFRAMES][NORIENT];
+	int (PSize+1) Pin_2s5[NFRAMES][NORIENT];
+	
+	int (PSize+2) Pin_3s7[NFRAMES][NORIENT];
+	int (PSize+2) Pin_3s8[NFRAMES][NORIENT];
+	
+	int (PSize+3) Pin_4s10[NFRAMES][NORIENT];
+	int (PSize+3) Pin_4s11[NFRAMES][NORIENT];
+
+
+	// Pouts<N>: output for frame 0 carried from stage 1 to stage 11.
+	int PSize Pouts1[NORIENT];
+	int PSize Pouts2[NORIENT];
+	int PSize Pouts3[NORIENT];
+	int PSize Pouts4[NORIENT];
+	int PSize Pouts5[NORIENT];
+	int PSize Pouts6[NORIENT];
+	int PSize Pouts7[NORIENT];
+	int PSize Pouts8[NORIENT];
+	int PSize Pouts9[NORIENT];
+	int PSize Pouts10[NORIENT];
+	int PSize Pouts11[NORIENT];
+	// Pout_1s<N>: output for frame 1 carried from stage 4 to stage 11.
+	int PSize Pout_1s4[NORIENT];
+	int PSize Pout_1s5[NORIENT];
+	int PSize Pout_1s6[NORIENT];
+	int PSize Pout_1s7[NORIENT];
+	int PSize Pout_1s8[NORIENT];
+	int PSize Pout_1s9[NORIENT];
+	int PSize Pout_1s10[NORIENT];
+	int PSize Pout_1s11[NORIENT];
+	// Pout_2s<N>: output for frame 2 carried from stage 7 to stage 11.
+	int PSize Pout_2s7[NORIENT];
+	int PSize Pout_2s8[NORIENT];
+	int PSize Pout_2s9[NORIENT];
+	int PSize Pout_2s10[NORIENT];
+	int PSize Pout_2s11[NORIENT];
+	// Pout_3s<N>: output for frame 3 carried from stage 10 to stage 11.
+	int PSize Pout_3s10[NORIENT];
+	int PSize Pout_3s11[NORIENT];
+
+
+
+	//Initialisations
+	//cur_frame=1;
+	
+
+	//***********************************************
+	//Body of the function
+	//***********************************************
+	par(orien=0;orien<NORIENT;orien++)
+	{
+		// Within one orientation the par blocks below are executed one after another, one block per pipeline stage.
+		//-------------------------------------------
+		//CURRENT FRAME == 1
+		
+		//Pipeline Stage 1
+		par
+		{
+			//Writing Pout[0]
+			Pouts1[orien] = adjs((Pin[0][orien])\\1, width(Pout)); // frame 0 is taken as is, least significant bit dropped
+
+			D[orien] = adjs(Pin[1][orien],width(D)) - adjs(Pin[0][orien],width(D));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pins1[fr][orien]=Pin[fr][orien];
+			}
+			/////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 2
+		par
+		{
+			A[orien] = abs(D[orien])>(PI); // jump flag between frames 0 and 1
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pins2[fr][orien]=Pins1[fr][orien];
+				
+			}
+			Ds2[orien]=D[orien];
+			
+			//Writing Pout[0]
+			Pouts2[orien] = Pouts1[orien];
+
+			////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 3
+		par
+		{
+			par(cf=1; cf<NFRAMES;cf++) // the correction is applied to frame 1 and to every later frame
+			{
+				Pin_2[cf][orien]=adjs(Pins2[cf][orien],width(Pin_2)) - DOUBLE_PI*(adjs(sign(Ds2[orien]),width(Pin_2))*2+1) * (signed)adju(A[orien],width(Pin_2)); // subtracts DOUBLE_PI*(2*sign(D)+1) when A is set; presumably sign() gives -1 for negative D and 0 otherwise, i.e. a factor of -1 or +1 -- confirm
+			}
+
+			//cur_frame=cur_frame+1;
+			
+			//Writing Pout[0]
+			Pouts3[orien] = Pouts2[orien];
+		}
+	
+
+		//-------------------------------------------
+		//CURRENT FRAME == 2
+
+		//Pipeline Stage 4
+		par
+		{
+			//Writing Pout[1]
+			Pout_1s4[orien] = adjs((Pin_2[1][orien])\\1, width(Pout));
+			
+			
+			D_2[orien] = adjs(Pin_2[2][orien],width(D_2)) - adjs(Pin_2[1][orien],width(D_2));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++) // index 0 is copied too although stage 3 only writes Pin_2 from index 1 upwards
+			{
+				Pin_2s4[fr][orien]=Pin_2[fr][orien];
+			}
+			/////////////////////////////////////////
+			Pouts4[orien]=Pouts3[orien];
+		}
+		
+		//Pipeline Stage 5
+		par
+		{
+			A_2[orien] = abs(D_2[orien])>(PI); // jump flag between corrected frames 1 and 2
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_2s5[fr][orien]=Pin_2s4[fr][orien];
+			}
+			D_2s5[orien]=D_2[orien];
+			Pout_1s5[orien]=Pout_1s4[orien];
+			Pouts5[orien]=Pouts4[orien];
+			////////////////////////////////////////
+		}
+
+		
+		//Pipeline Stage 6
+		par
+		{
+			par(cf=2; cf<NFRAMES;cf++) // second correction: frame 2 and later
+			{
+				Pin_3[cf][orien]=adjs(Pin_2s5[cf][orien],width(Pin_3)) - DOUBLE_PI*(adjs(sign(D_2s5[orien]),width(Pin_3))*2+1) * (signed)adju(A_2[orien],width(Pin_3));
+			}
+			
+			//cur_frame=cur_frame+1;
+			Pout_1s6[orien]=Pout_1s5[orien];
+			Pouts6[orien]=Pouts5[orien];
+		}
+
+		//-------------------------------------------
+		//CURRENT FRAME == 3
+		
+		//Pipeline Stage 7
+		par
+		{
+			//Writing Pout[2]
+			Pout_2s7[orien] = adjs((Pin_3[2][orien])\\1, width(Pout));
+			
+
+			D_3[orien] = adjs(Pin_3[3][orien],width(D_3)) - adjs(Pin_3[2][orien],width(D_3));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_3s7[fr][orien]=Pin_3[fr][orien];
+			}
+			Pout_1s7[orien]=Pout_1s6[orien];
+			Pouts7[orien]=Pouts6[orien];
+			/////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 8
+		par
+		{
+			A_3[orien] = abs(D_3[orien])>(PI); // jump flag between corrected frames 2 and 3
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_3s8[fr][orien]=Pin_3s7[fr][orien];
+			}
+			D_3s8[orien]=D_3[orien];
+			Pout_2s8[orien]=Pout_2s7[orien];
+			Pout_1s8[orien]=Pout_1s7[orien];
+			Pouts8[orien]=Pouts7[orien];
+			////////////////////////////////////////
+		}
+
+		
+		//Pipeline Stage 9
+		par
+		{
+			par(cf=3; cf<NFRAMES;cf++) // third correction: frame 3 and later
+			{
+				Pin_4[cf][orien]=adjs(Pin_3s8[cf][orien],width(Pin_4)) - DOUBLE_PI*(adjs(sign(D_3s8[orien]),width(Pin_4))*2+1) * (signed)adju(A_3[orien],width(Pin_4));
+			}
+
+			//cur_frame=cur_frame+1;
+			Pout_2s9[orien]=Pout_2s8[orien];
+			Pout_1s9[orien]=Pout_1s8[orien];
+			Pouts9[orien]=Pouts8[orien];
+		}
+
+
+		//-------------------------------------------
+		//CURRENT FRAME == 4
+		
+		//Pipeline Stage 10
+		par
+		{
+			//Writing Pout[3]
+			Pout_3s10[orien] = adjs((Pin_4[3][orien])\\1, width(Pout));
+			
+			D_4[orien] = adjs(Pin_4[4][orien],width(D_4)) - adjs(Pin_4[3][orien],width(D_4));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_4s10[fr][orien]=Pin_4[fr][orien];
+			}
+			
+			Pout_2s10[orien]=Pout_2s9[orien];
+			Pout_1s10[orien]=Pout_1s9[orien];
+			Pouts10[orien]=Pouts9[orien];
+			/////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 11
+		par
+		{
+			A_4[orien] = abs(D_4[orien])>(PI); // jump flag between corrected frames 3 and 4
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_4s11[fr][orien]=Pin_4s10[fr][orien];
+			}
+			D_4s11[orien]=D_4[orien];
+
+			Pout_3s11[orien]=Pout_3s10[orien];
+			Pout_2s11[orien]=Pout_2s10[orien];
+			Pout_1s11[orien]=Pout_1s10[orien];
+			Pouts11[orien]=Pouts10[orien];
+			////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 12
+		//Writing Pout[0..4]
+		par
+		{
+			Pout[0][orien]=Pouts11[orien];
+			Pout[1][orien]=Pout_1s11[orien];
+			Pout[2][orien]=Pout_2s11[orien];
+			Pout[3][orien]=Pout_3s11[orien];
+			Pout[4][orien]=adjs((adjs(Pin_4s11[4][orien],width(Pin_4s11)) - DOUBLE_PI*(adjs(sign(D_4s11[orien]),width(Pin_4s11))*2+1)*(signed)adju(A_4[orien],width(Pin_4s11)))\\1, width(Pout)); // last correction is applied inline to frame 4, then the least significant bit is dropped
+		}
+
+	}//par orient
+}
+
+//***************************************************
+//Macro unwrap_3: version for 3 frames
+//
+//LATENCY = 6;
+//
+//bits format: 	
+//IN:			Pin [NFRAMES][NORIENTATIONS]	--> sign-2-6
+//
+//OUT:			Pout[NFRAMES][NORIENTATIONS]	--> sign-4-5
+//
+//***************************************************
+macro proc unwrap_3(Pin, Pout){
+	// Temporal phase unwrapping for three frames (indices 0..2), one 6-stage
+	// pipeline per orientation. Pin and Pout are indexed [frame][orientation].
+	// For each pair of consecutive frames the difference D is formed; when its
+	// magnitude exceeds PI, DOUBLE_PI*(2*sign(D)+1) is subtracted from the later
+	// frame(s). Every output has its least significant bit dropped.
+	// PI, NFRAMES and NORIENTATIONS come from the enclosing scope.
+
+	//***********************************************
+	//Constant definitions 
+	//***********************************************
+	macro expr DOUBLE_PI  		= 402;	// 2*PI with 6 fractional bits (2*PI*64 is about 402)
+	//macro expr PI		  		= 201;	
+	macro expr PSize			= 10;
+	
+	
+	
+	//***********************************************
+	//Declarations: Variables
+	//***********************************************
+	unsigned 3 fr;
+	unsigned 3 orien;
+	//static unsigned int 3 cur_frame= 1;
+	//unsigned int 3 cur_frame;
+	unsigned int 1 A[NORIENTATIONS];	// 1 when |D| exceeds PI
+	int PSize D[NORIENTATIONS];			// difference between frames 1 and 0
+	unsigned int 3 cf;
+
+
+	int (PSize+1) Pin_2[NFRAMES][NORIENTATIONS];
+	int (PSize+2) Pin_3[NFRAMES][NORIENTATIONS];	// only its width is used here: working width of the second correction
+	int (PSize+3) Pin_4[NFRAMES][NORIENTATIONS];
+	int (PSize+2) D_2[NORIENTATIONS];
+	int (PSize+3) D_3[NORIENTATIONS];
+	int (PSize+4) D_4[NORIENTATIONS];
+	unsigned int 1 A_2[NORIENTATIONS], A_3[NORIENTATIONS], A_4[NORIENTATIONS];
+
+
+	//Pipeline auxiliary variable declarations
+	// Names ending in s<N> are copies that carry a value through pipeline stage N.
+	int PSize Ds2[NORIENTATIONS];
+	int (PSize+2) D_2s5[NORIENTATIONS];	
+	int (PSize-1) Pins1[NFRAMES][NORIENTATIONS], Pins2[NFRAMES][NORIENTATIONS];
+	int (PSize+1) Pin_2s4[NFRAMES][NORIENTATIONS], Pin_2s5[NFRAMES][NORIENTATIONS];
+	int PSize Pouts1[NORIENTATIONS], Pouts2[NORIENTATIONS], Pouts3[NORIENTATIONS], Pouts4[NORIENTATIONS], Pouts5[NORIENTATIONS];	
+	int PSize Pout_1s4[NORIENTATIONS], Pout_1s5[NORIENTATIONS];	
+
+	//***********************************************
+	//Body of the function
+	//***********************************************
+	par(orien=0;orien<NORIENTATIONS;orien++)
+	{
+		
+		//-------------------------------------------
+		//CURRENT FRAME == 1
+		
+		//Pipeline Stage 1
+		par
+		{
+			//Writing Pout[0]
+			Pouts1[orien] = adjs((Pin[0][orien])\\1, width(Pout));
+
+			D[orien] = adjs(Pin[1][orien],width(D)) - adjs(Pin[0][orien],width(D));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pins1[fr][orien]=Pin[fr][orien];
+			}
+			/////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 2
+		par
+		{
+			A[orien] = abs(D[orien])>(PI);
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pins2[fr][orien]=Pins1[fr][orien];
+			}
+			Ds2[orien]=D[orien];
+			
+			//Writing Pout[0]
+			Pouts2[orien] = Pouts1[orien];
+
+			////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 3
+		par
+		{
+			//First correction: applied to frame 1 and every later frame
+			par(cf=1; cf<NFRAMES;cf++)
+			{
+				Pin_2[cf][orien]=adjs(Pins2[cf][orien],width(Pin_2)) - DOUBLE_PI*(adjs(sign(Ds2[orien]),width(Pin_2))*2+1) * (signed)adju(A[orien],width(Pin_2));
+			}
+
+			//cur_frame=cur_frame+1;
+			
+			//Writing Pout[0]
+			Pouts3[orien] = Pouts2[orien];
+		}
+	
+
+		//-------------------------------------------
+		//CURRENT FRAME == 2
+
+		//Pipeline Stage 4
+		par
+		{
+			//Writing Pout[1]
+			Pout_1s4[orien] = adjs((Pin_2[1][orien])\\1, width(Pout));
+			
+			
+			D_2[orien] = adjs(Pin_2[2][orien],width(D_2)) - adjs(Pin_2[1][orien],width(D_2));
+		
+			//Copying Pin for the next stage/////////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_2s4[fr][orien]=Pin_2[fr][orien];
+			}
+			/////////////////////////////////////////
+			Pouts4[orien]=Pouts3[orien];
+		}
+		
+		//Pipeline Stage 5
+		par
+		{
+			A_2[orien] = abs(D_2[orien])>(PI);
+			
+		
+			//Copying Pin, D for the next stage/////
+			par(fr=0;fr<NFRAMES;fr++)
+			{
+				Pin_2s5[fr][orien]=Pin_2s4[fr][orien];
+			}
+			D_2s5[orien]=D_2[orien];
+			Pout_1s5[orien]=Pout_1s4[orien];
+			Pouts5[orien]=Pouts4[orien];
+			////////////////////////////////////////
+		}
+		
+		//Pipeline Stage 6
+		par
+		{			
+			//Writing Pout[0..2]		
+			Pout[0][orien]=Pouts5[orien];
+			Pout[1][orien]=Pout_1s5[orien];
+			//Second correction for frame 2, done at width(Pin_3) = PSize+2 exactly as
+			//stage 6 of the 5-frame unwrap does. Narrowing the (PSize+1)-bit operand
+			//to width(Pout) before the subtraction discarded its top bit, so results
+			//outside the PSize-bit range wrapped before the least significant bit
+			//was dropped. Only the final value is resized to width(Pout).
+			Pout[2][orien]=adjs((adjs(Pin_2s5[2][orien],width(Pin_3)) - DOUBLE_PI*(adjs(sign(D_2s5[orien]),width(Pin_3))*2+1)*(signed)adju(A_2[orien],width(Pin_3)))\\1, width(Pout));
+		}       		
+				
+	}//par orient
+}
+
+
+//*************************************************************************
+// macro full_velocity
+/*************************************************************************/
+macro proc full_velocity(FVx,FVy,LE,thres,nc_min, enable, Ox, Oy) // Combines the per-orientation velocities (FVx, FVy) whose LE is below thres into one output pair (Ox, Oy); outputs are set to NaN when fewer than nc_min orientations qualify
+{    
+    macro expr Frac=4; // number of low bits dropped from the products of stage 8
+    macro expr DIVLATENCY=DIVIDER_LATENCY;  // added 1 for thresholding in invert function -- NOTE(review): no +1 is applied here, unlike full_velocity_small; confirm
+	macro expr THlat=DIVLATENCY+DIVLATENCY+4; // length of the nc delay line
+	macro expr SUMlat=DIVLATENCY+2; // length of the sumX / sumY delay lines
+
+    unsigned int 4 nc[THlat];
+    unsigned int 1 nc_par[NORIENTATIONS];
+    signed int FLOW_BITS Vx[NORIENTATIONS], Vy[NORIENTATIONS], auxYY[NORIENTATIONS],auxXX[NORIENTATIONS], auxXY[NORIENTATIONS];
+    signed int (FLOW_BITS) sumX[SUMlat],sumY[SUMlat], sumYYL_2, sumXXL_2, sumXYL_2;
+    signed int (DIVIDER_INPUT) NumX[DIVLATENCY], NumY[DIVLATENCY];
+    signed int DIVIDER_INPUT aux_den_0, aux_den_1, aux_NumX_0, aux_NumX_1,aux_NumY_0, aux_NumY_1;
+    unsigned int 1 cond[NORIENTATIONS]; // only referenced by the commented-out "Pipeline 0" block below
+    signed int (DIVIDER_INPUT) Vxx[NORIENTATIONS][DIVLATENCY+1], Vyy[NORIENTATIONS][DIVLATENCY+1],Vxy[NORIENTATIONS][DIVLATENCY+1];
+    signed int DIVIDER_INPUT SumXX_YY[NORIENTATIONS];
+    signed int DIVIDER_INPUT L2[NORIENTATIONS], den;
+    signed FLOW_BITS quotX, quotY;	    
+
+    //assert (NORIENTATIONS==8, 0, "The code function only for 8 orientations");
+
+    //------------------------------------------------------------
+    //  Verify bitwidth in operations
+    //  improve division (ex. divider core)
+    //------------------------------------------------------------
+    // Every statement below sits in this single par block; the "Pipeline N" labels name the register stage each statement feeds.
+    par
+    {        
+		/*/  Pipeline 0
+        par(o=0;o<NORIENTATIONS;o++)
+        {
+			cond[o]=(LE[o]<thres);  // && (FVx[o]!=NAN) && (FVy[o]!=NAN); 
+            auxFVx[o]=FVx[o];
+            auxFVy[o]=FVy[o];            
+        } */
+        // Pipeline 1
+        par(o=0;o<NORIENTATIONS;o++)
+        {            
+            //if(cond[o] && (FVxx[o]+FVyy[o])>EPS)
+            if(LE[o]<thres && (FVx[o]!=0 || FVy[o]!=0) )            
+            par
+            {
+                //L2[o]= ((signed)one)/(FVxx[o] + FVyy[o]);
+                SumXX_YY[o]= (adjs(FVx[o],width(SumXX_YY))*adjs(FVx[o],width(SumXX_YY))) + (adjs(FVy[o],width(SumXX_YY))*adjs(FVy[o],width(SumXX_YY))); //FVxx[o] + FVyy[o];
+                Vx[o]=FVx[o];
+                Vy[o]=FVy[o];
+                Vxx[o][0]=(adjs(FVx[o],width(Vxx))*adjs(FVx[o],width(Vxx))); //\\Frac;
+                Vyy[o][0]=(adjs(FVy[o],width(Vyy))*adjs(FVy[o],width(Vyy))); //\\Frac;
+                Vxy[o][0]=(adjs(FVy[o],width(Vxy))*adjs(FVx[o],width(Vxy))); //\\Frac;
+                nc_par[o]=1; // this orientation counts towards nc
+            }
+            else
+            par
+            {
+                nc_par[o]=0;
+                //L2[o]=0;
+                SumXX_YY[o]=0;
+                Vx[o]=0;
+                Vy[o]=0;
+                Vxx[o][0]=0;
+                Vyy[o][0]=0;
+                Vxy[o][0]=0;
+            }
+        }        
+        // Delay lines: entry v takes the value of entry v-1, so the squared terms are still available at index DIVLATENCY when they are multiplied by L2.
+        par(v=1;v<(DIVLATENCY+1);v++)
+        {
+            par(o=0;o<NORIENTATIONS;o++)
+            {
+                Vxx[o][v]=Vxx[o][v-1];
+                Vyy[o][v]=Vyy[o][v-1];
+                Vxy[o][v]=Vxy[o][v-1];
+            }
+        }
+
+		//Pipeline 2
+        par{
+			//  Pipeline 2            
+	        //nc=adju((L2[0]>0),4)+adju((L2[1]>0),4)+adju((L2[2]>0),4)+adju((L2[3]>0),4)+adju((L2[4]>0),4)+adju((L2[5]>0),4)+adju((L2[6]>0),4)+adju((L2[7]>0),4);
+	        nc[0] = UnSumMacro(nc_par, 0, NORIENTATIONS-1, width(nc));
+	        //sumX = Vx[0]+Vx[1]+Vx[2]+Vx[3]+Vx[4]+Vx[5]+Vx[6]+Vx[7];
+	        sumX[0] = SumMacro(Vx, 0, NORIENTATIONS-1, width(sumX));
+	        //sumY = Vy[0]+Vy[1]+Vy[2]+Vy[3]+Vy[4]+Vy[5]+Vy[6]+Vy[7];
+	        sumY[0] = SumMacro(Vy, 0, NORIENTATIONS-1, width(sumY));
+
+            // Pipeline 2
+            par(o=0;o<NORIENTATIONS;o++)
+            {
+                invert(SumXX_YY[o], enable, L2[o]);  // invert is defined elsewhere; presumably L2 receives a scaled reciprocal of SumXX_YY (see the commented-out division in Pipeline 1) -- confirm
+            }
+
+	        // Pipeline 6
+	        par(O=0;O<NORIENTATIONS;O++)
+	        {
+	            //if(L2[O]!=0)  // && (Vy[O]*Vx[O])!=0)
+	            par
+	            {
+	                
+	                //divide12(Vyy[O], L2[O], auxYY[O]);
+					auxYY[O] = ((Vyy[O][DIVLATENCY]*adjs(L2[O],width(Vyy)))>>(DIVIDER_INPUT-5)) <-FLOW_BITS;	                
+	                //divide12(Vxx[O], L2[O], auxXX[O]);
+					auxXX[O] = ((Vxx[O][DIVLATENCY]*adjs(L2[O],width(Vxx)))>>(DIVIDER_INPUT-5)) <- FLOW_BITS;  
+	                //divide12(Vxy[O], L2[O], auxXY[O]);
+					auxXY[O] = ((Vxy[O][DIVLATENCY]*adjs(L2[O],width(Vxy)))>>(DIVIDER_INPUT-5)) <- FLOW_BITS;	                
+	            }
+	            //else
+	                //NanCond[O]=1;
+	        }
+		}
+
+        //  Pipeline 7        
+        sumYYL_2 = SumMacro(auxYY, 0, NORIENTATIONS-1, width(sumYYL_2));        
+        sumXXL_2 = SumMacro(auxXX, 0, NORIENTATIONS-1, width(sumXXL_2));        
+        sumXYL_2 = SumMacro(auxXY, 0, NORIENTATIONS-1, width(sumXYL_2));
+        
+
+        //  Pipeline 8
+        aux_den_0 = (adjs(sumXYL_2,width(aux_den_0)+4)*adjs(sumXYL_2,width(aux_den_0)+4))\\Frac;
+        aux_den_1 = (adjs(sumXXL_2,width(aux_den_1)+4)*adjs(sumYYL_2,width(aux_den_1)+4))\\Frac;
+        aux_NumX_0 = (adjs(sumX[SUMlat-1],width(aux_NumX_0)+4)*adjs(sumYYL_2,width(aux_NumX_0)+4))\\Frac;
+        aux_NumX_1 = (adjs(sumY[SUMlat-1],width(aux_NumX_1)+4)*adjs(sumXYL_2,width(aux_NumX_1)+4))\\Frac;
+        aux_NumY_0 = (adjs(sumX[SUMlat-1],width(aux_NumY_0)+4)*adjs(sumXYL_2,width(aux_NumY_0)+4))\\Frac;
+        aux_NumY_1 = (adjs(sumY[SUMlat-1],width(aux_NumY_1)+4)*adjs(sumXXL_2,width(aux_NumY_1)+4))\\Frac;
+
+        //  Pipeline 9        
+        //den = ((signed)one)/(aux_den_0 - aux_den_1);
+        invert((aux_den_0 - aux_den_1), enable, den);
+        NumX[0] = -(aux_NumX_0 - aux_NumX_1);
+        NumY[0] = aux_NumY_0 - aux_NumY_1;
+
+        //  Pipeline 10-14
+		//quotX=adjs((NumX<<4)/den, FLOW_BITS);
+        quotX=((NumX[DIVLATENCY-1]*adjs(den,width(NumX)))>>(DIVIDER_INPUT-5)) <- FLOW_BITS;
+		//quotY=adjs((NumY<<4)/den, FLOW_BITS);
+        quotY=((NumY[DIVLATENCY-1]*adjs(den,width(NumY)))>>(DIVIDER_INPUT-5)) <- FLOW_BITS;
+
+        //divide12(NumX, den, quotX);
+        //divide12(NumY, den, quotY);
+
+		// delay for threshold
+        par(i=1; i<THlat; i++)
+		{
+			nc[i]=nc[i-1];
+		}
+        par(n=1; n<DIVLATENCY; n++)
+		{
+			NumX[n]=NumX[n-1];
+            NumY[n]=NumY[n-1];
+		}
+		par(s=1; s<SUMlat; s++)
+		{
+			sumX[s]=sumX[s-1];
+			sumY[s]=sumY[s-1];
+		}
+
+        //  Pipeline 15
+        if (nc[THlat-1]>=nc_min) // uses the orientation count from the end of its delay line
+        par
+        {
+            Ox= quotX;    //       den = (sumXYL_22-sumXXL_2*sumYYL_2)
+            //Ox= (-NumX>>(lmo(den<-(width(den)-1))))<-FLOW_BITS;
+            //Ox= (-NumX);
+            Oy= quotY;
+            //Oy= ( NumY>>(lmo(den<-(width(den)-1))))<-FLOW_BITS;
+            //Oy= (NumY);
+        }
+        else
+        par
+        {
+            Ox=SetNAN(Ox);   //in matlab is NaN
+            Oy=SetNAN(Oy);   //in matlab is NaN
+        }
+
+    }
+
+
+}
+
+
+//*************************************************************************
+// macro full_velocity_small
+/*************************************************************************/
+macro proc full_velocity_small(FVx,FVy,LE,thres, Div_thr,nc_min, Ox, Oy)
+{    
+    macro expr Frac=8;
+    macro expr DIVLATENCY=DIVIDER_LATENCY+1;  // added 1 for thresholding in invert function
+	macro expr THlat=DIVLATENCY+DIVLATENCY+4;
+	macro expr SUMlat=DIVLATENCY+2;    
+
+    unsigned int 4 nc[THlat];
+    unsigned int 1 nc_par[NORIENTATIONS];
+    signed int FLOW_BITS Vx[NORIENTATIONS], Vy[NORIENTATIONS];
+    signed int DIVIDER_INPUT auxYY[NORIENTATIONS], auxXX[NORIENTATIONS], auxXY[NORIENTATIONS];
+    signed int (FLOW_BITS+3) sumX[SUMlat],sumY[SUMlat];
+    signed int (DIVIDER_INPUT) sumYYL_2, sumXXL_2, sumXYL_2;
+    signed int (DIVIDER_INPUT) NumX[DIVLATENCY], NumY[DIVLATENCY];
+    signed int (DIVIDER_INPUT) aux_den_0, aux_den_1, aux_NumX_0, aux_NumX_1,aux_NumY_0, aux_NumY_1;
+    unsigned int 1 cond[NORIENTATIONS];
+    signed int (DIVIDER_INPUT) Vxx[NORIENTATIONS][DIVLATENCY+1], Vyy[NORIENTATIONS][DIVLATENCY+1],Vxy[NORIENTATIONS][DIVLATENCY+1];
+    signed int DIVIDER_INPUT SumXX_YY[NORIENTATIONS];
+    signed int DIVIDER_INPUT L2[NORIENTATIONS], den;
+    signed FLOW_BITS quotX, quotY;	
+
+    //assert (NORIENTATIONS==8, 0, "The code function only for 8 orientations");
+
+    //------------------------------------------------------------
+    //  Verify bitwidth in operations
+    //  improve division (ex. divider core)
+    //------------------------------------------------------------
+
+    par
+    {        
+		/*/  Pipeline 0
+        par(o=0;o<NORIENTATIONS;o++)
+        {
+			cond[o]=(LE[o]<thres);  // && (FVx[o]!=NAN) && (FVy[o]!=NAN); 
+            auxFVx[o]=FVx[o];
+            auxFVy[o]=FVy[o];            
+        } */
+        // Pipeline 1
+        par(o=0;o<NORIENTATIONS;o++)
+        {            
+            //if(cond[o] && (FVxx[o]+FVyy[o])>EPS)
+            if(LE[o]<thres && ((FVx[o]!=0) || FVy[o]!=0) && (FVx[o]!=NAN) && (FVy[o]!=NAN))  //((FVx[o]*FVx[o])+(FVy[o]*FVy[o]))>EPS)//(FVx[o]!=0 || FVy[o]!=0) )            
+            par
+            {
+                //L2[o]= ((signed)one)/(FVxx[o] + FVyy[o]);
+                SumXX_YY[o]= ( (adjs(FVx[o],width(SumXX_YY)+4)*adjs(FVx[o],width(SumXX_YY)+4)) + (adjs(FVy[o],width(SumXX_YY)+4)*adjs(FVy[o],width(SumXX_YY)+4)))\\4 ; //FVxx[o] + FVyy[o];
+                Vx[o]=FVx[o];
+                Vy[o]=FVy[o];
+                Vxx[o][0]=(adjs(FVx[o],width(Vxx)+4)*adjs(FVx[o],width(Vxx)+4))\\4;
+                Vyy[o][0]=(adjs(FVy[o],width(Vyy)+4)*adjs(FVy[o],width(Vyy)+4))\\4;
+                Vxy[o][0]=(adjs(FVy[o],width(Vxy)+4)*adjs(FVx[o],width(Vxy)+4))\\4;
+                nc_par[o]=1;
+            }
+            else
+            par
+            {
+                nc_par[o]=0;
+                //L2[o]=0;
+                SumXX_YY[o]=0;
+                Vx[o]=0;
+                Vy[o]=0;
+                Vxx[o][0]=0;
+                Vyy[o][0]=0;
+                Vxy[o][0]=0;
+            }
+        }        
+
+        par(v=1;v<(DIVLATENCY+1);v++)
+        {
+            par(o=0;o<NORIENTATIONS;o++)
+            {
+                Vxx[o][v]=Vxx[o][v-1];
+                Vyy[o][v]=Vyy[o][v-1];
+                Vxy[o][v]=Vxy[o][v-1];
+            }
+        }
+
+		//Pipeline 2
+        par{
+			//  Pipeline 2            
+	        //nc=adju((L2[0]>0),4)+adju((L2[1]>0),4)+adju((L2[2]>0),4)+adju((L2[3]>0),4)+adju((L2[4]>0),4)+adju((L2[5]>0),4)+adju((L2[6]>0),4)+adju((L2[7]>0),4);
+	        nc[0] = UnSumMacro(nc_par, 0, NORIENTATIONS-1, width(nc));
+	        //sumX = Vx[0]+Vx[1]+Vx[2]+Vx[3]+Vx[4]+Vx[5]+Vx[6]+Vx[7];
+	        sumX[0] = SumMacro(Vx, 0, NORIENTATIONS-1, width(sumX));
+	        //sumY = Vy[0]+Vy[1]+Vy[2]+Vy[3]+Vy[4]+Vy[5]+Vy[6]+Vy[7];
+	        sumY[0] = SumMacro(Vy, 0, NORIENTATIONS-1, width(sumY));
+
+            // Pipeline 2
+            par(o=0;o<NORIENTATIONS;o++)
+            {
+                invert(SumXX_YY[o], Div_thr, L2[o]);  
+            }
+
+	        // Pipeline 6
+	        par(O=0;O<NORIENTATIONS;O++)
+	        {
+	            //if(L2[O]!=0)  // && (Vy[O]*Vx[O])!=0)
+	            par
+	            {   
+					//auxYY[O] = ((adjs(Vyy[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-5)) <-DIVIDER_INPUT;	                	                
+                    auxYY[O] = ((adjs(Vyy[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-9)) <-DIVIDER_INPUT;
+					//auxXX[O] = ((adjs(Vxx[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-5)) <- DIVIDER_INPUT;  	                
+                    auxXX[O] = ((adjs(Vxx[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-9)) <- DIVIDER_INPUT;
+					//auxXY[O] = ((adjs(Vxy[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-5)) <- DIVIDER_INPUT;	                
+                    auxXY[O] = ((adjs(Vxy[O][DIVLATENCY],26)*adjs(L2[O],26))>>(DIVIDER_INPUT-9)) <- DIVIDER_INPUT;
+	            }
+	            //else
+	                //NanCond[O]=1;
+	        }
+		}
+
+        //  Pipeline 7        
+        sumYYL_2 = SumMacro(auxYY, 0, NORIENTATIONS-1, width(sumYYL_2));        
+        sumXXL_2 = SumMacro(auxXX, 0, NORIENTATIONS-1, width(sumXXL_2));        
+        sumXYL_2 = SumMacro(auxXY, 0, NORIENTATIONS-1, width(sumXYL_2));
+        
+
+        //  Pipeline 8
+        //aux_den_0 = (adjs(sumXYL_2,width(aux_den_0)+4)*adjs(sumXYL_2,width(aux_den_0)+4))\\4;
+        aux_den_0 = (adjs(sumXYL_2,width(aux_den_0)+Frac)*adjs(sumXYL_2,width(aux_den_0)+Frac))\\Frac;
+        //aux_den_1 = (adjs(sumXXL_2,width(aux_den_1)+4)*adjs(sumYYL_2,width(aux_den_1)+4))\\4;
+        aux_den_1 = (adjs(sumXXL_2,width(aux_den_1)+Frac)*adjs(sumYYL_2,width(aux_den_1)+Frac))\\Frac;
+        aux_NumX_0 = (adjs(sumX[SUMlat-1],width(aux_NumX_0)+4)*adjs(sumYYL_2,width(aux_NumX_0)+4))\\4;
+        aux_NumX_1 = (adjs(sumY[SUMlat-1],width(aux_NumX_1)+4)*adjs(sumXYL_2,width(aux_NumX_1)+4))\\4;
+        aux_NumY_0 = (adjs(sumX[SUMlat-1],width(aux_NumY_0)+4)*adjs(sumXYL_2,width(aux_NumY_0)+4))\\4;
+        aux_NumY_1 = (adjs(sumY[SUMlat-1],width(aux_NumY_1)+4)*adjs(sumXXL_2,width(aux_NumY_1)+4))\\4;
+
+        //  Pipeline 9        
+        //den = ((signed)one)/(aux_den_0 - aux_den_1);
+        invert((aux_den_0 - aux_den_1), Div_thr, den);
+        NumX[0] = -(aux_NumX_0 - aux_NumX_1);
+        NumY[0] = aux_NumY_0 - aux_NumY_1;        
+
+        //  Pipeline 10-14
+		//quotX=adjs((NumX<<4)/den, FLOW_BITS);
+        if(den!=0)
+        par
+        {
+            quotX=((adjs(NumX[DIVLATENCY-1],26)*adjs(den,26))>>(DIVIDER_INPUT-1)) <- FLOW_BITS;
+            quotY=((adjs(NumY[DIVLATENCY-1],26)*adjs(den,26))>>(DIVIDER_INPUT-1)) <- FLOW_BITS;
+        }
+        else
+        par
+        {
+            quotX=SetNAN(quotX);
+            quotY=SetNAN(quotY);
+        }
+		//quotY=adjs((NumY<<4)/den, FLOW_BITS);        
+
+        //divide12(NumX, den, quotX);
+        //divide12(NumY, den, quotY);
+
+		// delay for threshold
+        par(i=1; i<THlat; i++)
+		{
+			nc[i]=nc[i-1];
+		}
+        par(n=1; n<DIVLATENCY; n++)
+		{
+			NumX[n]=NumX[n-1];
+            NumY[n]=NumY[n-1];            
+		}
+		par(s=1; s<SUMlat; s++)
+		{
+			sumX[s]=sumX[s-1];
+			sumY[s]=sumY[s-1];
+		}
+        
+        //  Pipeline 15
+        if (nc[THlat-1]>=nc_min)
+        par
+        {
+            Ox= quotX;    //       den = (sumXYL_22-sumXXL_2*sumYYL_2)
+            //Ox= (-NumX>>(lmo(den<-(width(den)-1))))<-FLOW_BITS;
+            //Ox= (-NumX);
+            Oy= quotY;
+            //Oy= ( NumY>>(lmo(den<-(width(den)-1))))<-FLOW_BITS;
+            //Oy= (NumY);
+        }
+        else
+        par
+        {
+            Ox=SetNAN(Ox);   //in matlab is NaN
+            Oy=SetNAN(Oy);   //in matlab is NaN
+        }
+
+    }
+}
+
+/*
+// Invert function : DIVLATENCY cycles
+// -----------------------------
+macro proc invert(Den, Div_thr, quot)
+{	
+    //signed int DIVIDER_INPUT Den_p0;    
+	unsigned int (log2ceil(width(Den))) MSB_Den;        
+    //static signed int 14 one = 0b01000000000000;
+    static signed int 18 one = 0b010000000000000000;
+    unsigned 1 cond[DIVIDER_LATENCY];
+    // Enable for Cores
+    static signal unsigned 1 enable;
+
+    interface divider_18 (signed DIVIDER_INPUT quot, signed DIVIDER_INPUT remd, unsigned 1 rfd) divider(signed  DIVIDER_INPUT dividend = one, 
+							signed DIVIDER_INPUT divisor = adjs(Den,DIVIDER_INPUT), unsigned 1 clk=__clock, unsigned 1 ce=enable) with {busformat="B<I>"};
+    
+    par
+    {          
+        //Enabling atan2 Core: only for 1 clock cycle
+        enable=1;
+        
+        cond[0]=(abs(Den)<((signed)(0@Div_thr)));
+        par(i=1;i<DIVIDER_LATENCY;i++)
+        {
+            cond[i]=cond[i-1];
+        }
+        if(cond[DIVIDER_LATENCY-1]==1)
+            quot = 0;
+        else
+            quot = divider.quot;
+    }
+}
+*/
+// Divide function
+// -----------------------------
+// divide12(Num, Den, quot): pipelined signed division quot = Num / Den using
+// the external divider_12 core.
+//   Num, Den : signed operands. Both are copied into registers declared with
+//              width(Num), so they are expected to share that width.
+//   quot     : result register. It receives divider.quot, or SetNAN(quot)
+//              when the operands were flagged as not valid or the delayed
+//              divisor is zero.
+// Both operands are shifted left by the same amount (bringing the larger
+// magnitude up to bit width(Num)-2) and then reduced to width(quot) bits by
+// dropping LSBs before being handed to the core.
+// NOTE(review): the core ports are FLOW_BITS wide while Num_p2/Den_p2 are
+// width(quot) wide, so this presumably requires width(quot)==FLOW_BITS -
+// confirm against the callers.
+macro proc divide12(Num, Den, quot)
+{
+	signed int (width(Num)) Num_p0, Num_p1, Den_p0, Den_p1;
+    signed int (width(quot)) Num_p2, Den_p2[DIVIDER_LATENCY];
+	unsigned int (log2ceil(width(Num))) shift, MSB_Num, MSB_Den;
+    unsigned int 1 NotValid, NotValid_2[DIVIDER_LATENCY];
+	
+
+	// External divider core, clocked by the design clock and always enabled.
+	interface divider_12 (signed FLOW_BITS quot, signed FLOW_BITS remd, unsigned 1 rfd) divider(signed  FLOW_BITS dividend = Num_p2, 
+							signed FLOW_BITS divisor = Den_p2[0], unsigned 1 clk=__clock, unsigned 1 ce=1) with {busformat="B<I>"};
+
+    par
+    {
+        // Pipeline 0: position of the leading one of |Num| and |Den|
+        // (0 is stored when the operand itself is zero).
+		if(Num>0)
+            MSB_Num = lmo(Num);
+        else
+            if(Num==0)
+                MSB_Num = 0;
+            else
+                MSB_Num = lmo(-Num);
+        
+        if(Den>0)
+            MSB_Den = lmo(Den);
+        else
+            if(Den==0)
+                MSB_Den = 0;
+            else
+                MSB_Den = lmo(-Den);
+        
+		Den_p0 = Den;
+        Num_p0 = Num;
+        
+        
+        //  Pipeline 1: operands delayed one stage to stay aligned with MSB_*.
+        Num_p1 = Num_p0;        
+        Den_p1 = Den_p0;
+
+        // Flag the division as not valid when the operand magnitudes differ by
+        // width(quot)-2 bits or more.
+        // MSB_Num and MSB_Den are unsigned, so abs() of their difference cannot
+        // give the magnitude (an unsigned value is never negative and the
+        // subtraction wraps when MSB_Den > MSB_Num). Subtract the smaller from
+        // the larger explicitly instead.
+        if( ((MSB_Num > MSB_Den) ? (MSB_Num-MSB_Den) : (MSB_Den-MSB_Num)) >= (width(quot)-2) )
+            NotValid=1;
+        else
+            NotValid=0;
+
+        // Common left shift that moves the larger leading one to bit
+        // width(Num)-2, keeping the sign bit intact.
+        if(MSB_Num > MSB_Den)
+        shift = width(Num)-MSB_Num-2 ;
+        else
+        shift = width(Num)-MSB_Den-2 ;
+
+		// Pipeline 2: normalise both operands and drop the low
+		// width(Num)-width(quot) bits; these registers feed the core.
+        NotValid_2[0] = NotValid;
+        //my_dividend = (Num_p1<<shift)\\(width(Num)-width(quot));
+        Num_p2 = (Num_p1<<shift)\\(width(Num)-width(quot));
+        //my_divisor = (Den_p1<<shift)\\(width(Num)-width(quot));
+        Den_p2[0] = (Den_p1<<shift)\\(width(Num)-width(quot));
+
+        // Delay lines: carry the validity flag and the divisor alongside the
+        // divider core for DIVIDER_LATENCY stages.
+        par(i=1;i<DIVIDER_LATENCY;i++)
+        {
+            NotValid_2[i] = NotValid_2[i-1];            
+            Den_p2[i] = Den_p2[i-1];
+        }
+        // Pipeline 3: output the core result, or NaN for a flagged or
+        // zero-divisor operation.
+        if(NotValid_2[DIVIDER_LATENCY-1]==0 && Den_p2[DIVIDER_LATENCY-1]!=0)
+        par
+		{			
+			quot = divider.quot;
+			//quot = (Num_p2)/(Den_p2);
+        }
+		else
+        quot = SetNAN(quot);
+    }
+}
+
+
+
+//--------------------------------------------
+// new_full_velocity(FV, LE, thres, nc_min, Ox, Oy)
+//
+// Combines the per-orientation component velocities FV[o] into one 2-D flow
+// estimate (Ox, Oy).
+//   FV, LE  : per-orientation arrays (NORIENTATIONS entries) holding the
+//             component velocity and the value compared against thres.
+//   thres   : an orientation is used only when LE[o] < thres and FV[o] is
+//             neither 0 nor NAN.
+//   nc_min  : minimum number of used orientations required for a valid output.
+//   Ox, Oy  : outputs; set with SetNAN() when too few orientations were used
+//             or the denominator was zero.
+//
+// Pay attention: sign on output is not changed
+//-----------------------------------------------
+macro proc new_full_velocity(FV, LE,thres,nc_min, Ox, Oy)
+{    
+    // NOTE(review): Frac is not referenced by the live code of this macro.
+    macro expr Frac=4;
+    // NOTE(review): the original remark said "added 1 for thresholding in invert
+    // function", but no +1 is applied here; DIVLATENCY equals DIVIDER_LATENCY.
+    macro expr DIVLATENCY=DIVIDER_LATENCY;
+	// Length of the delay line that carries the orientation count to the output stage.
+	macro expr THlat=DIVLATENCY+3;
+	//macro expr SUMlat=DIVLATENCY+2;    
+
+    // nc[0] is the count of used orientations; nc[1..] are its delayed copies.
+    unsigned int 4 nc[THlat];
+    // Delay line for the "denominator was zero" flag.
+    unsigned int 1 bad_div[DIVLATENCY];
+    // Per-orientation "used" flag (1 when the orientation passed the tests).
+    unsigned int 1 nc_par[NORIENTATIONS];
+    
+    //unsigned int 1 cond[NORIENTATIONS];
+    
+    
+    signed int 9 auxYY[NORIENTATIONS], auxXX[NORIENTATIONS], auxXY[NORIENTATIONS];
+    signed int (FLOW_BITS+4) Vx[NORIENTATIONS], Vy[NORIENTATIONS];
+    signed int (FLOW_BITS+7) sumX,sumY;
+    signed int (DIVIDER_INPUT) sumYYL_2, sumXXL_2, sumXYL_2;
+    signed int (DIVIDER_INPUT) aux_den_0, aux_den_1, aux_NumX_0, aux_NumX_1,aux_NumY_0, aux_NumY_1;
+    //signed int (DIVIDER_INPUT) NumX[DIVLATENCY], NumY[DIVLATENCY];
+    signed int (DIVIDER_INPUT) NumX, NumY;
+    // NOTE(review): den_1 is declared but never used in this macro.
+    signed int (DIVIDER_INPUT) den, den_1;
+    //signed int (DIVIDER_INPUT) diff[DIVLATENCY];
+   // signed int (DIVIDER_INPUT) diff;
+    signed int (DIVIDER_INPUT) quotX, quotY;	
+    
+    // Per-orientation weights applied to FV[o] to obtain its x and y contributions.
+    const int 9 WREAL_SUMX[NORIENTATIONS]	= {-81, -75, -58, -31, 0, 31, 58, 75};	//	2^7 * {-F0 * cos(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1} 
+	const int 9 WIMAG_SUMY[NORIENTATIONS]	= {0, -31, -58, -75, -81, -75, -58, -31};	//	2^7 * {-F0 * sin(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1}
+
+    // Per-orientation constants accumulated into sumXYL_2 / sumXXL_2 / sumYYL_2
+    // for every used orientation.
+    const int 9 SUMXY_L2[NORIENTATIONS]	= {0, 45, 64, 45, 0, -45, -64, -45};	//	 FRAC: 7 bits
+	const int 9 SUMXX_L2[NORIENTATIONS]	= {128, 109, 64, 19, 0, 19, 64, 109};	// FRAC: 7 bits
+    const int 9 SUMYY_L2[NORIENTATIONS]	= {0, 19, 64, 109, 128, 109, 64, 19};	//	 FRAC: 7 bits
+        
+    //assert (NORIENTATIONS==8, 0, "The code function only for 8 orientations");
+
+    //------------------------------------------------------------
+    //  Verify bitwidth in operations
+    //  improve division (ex. divider core)
+    //------------------------------------------------------------
+
+    par
+    {        
+		// Pipeline 1: select the usable orientations and load their contributions;
+		// rejected orientations contribute zeros and are not counted.
+        par(o=0;o<NORIENTATIONS;o++)
+        {            
+            if(LE[o]<thres && FV[o]!=0 && FV[o]!=NAN)  
+            par
+            {
+                // Products are computed 4 bits wider and the 4 LSBs are dropped.
+                Vx[o]=(adjs(FV[o], width(Vx)+4)*adjs(WREAL_SUMX[o],width(Vx)+4))\\4; //s-7-8
+                Vy[o]=(adjs(FV[o], width(Vy)+4)*adjs(WIMAG_SUMY[o],width(Vy)+4))\\4; //s-7-8
+                auxXX[o]=adjs(SUMXX_L2[o],width(auxXX)); //s-0-7 bits
+                auxXY[o]=adjs(SUMXY_L2[o],width(auxXY)); //s-0-7 bits
+                auxYY[o]=adjs(SUMYY_L2[o],width(auxYY)); //s-0-7 bits
+                nc_par[o]=1;
+            }
+            else
+            par
+            {
+                Vx[o]=0;
+                Vy[o]=0;                                
+                auxXX[o]=0;
+                auxXY[o]=0;
+                auxYY[o]=0;
+                nc_par[o]=0;
+            }
+        }        
+
+        //Pipeline 2: reduce over orientations (count, x/y sums and the three
+        // constant sums).
+        par
+        {
+	        nc[0] = UnSumMacro(nc_par, 0, NORIENTATIONS-1, width(nc));
+	        sumX = SumMacro(Vx, 0, NORIENTATIONS-1, width(sumX));  //s-10-8
+	        sumY = SumMacro(Vy, 0, NORIENTATIONS-1, width(sumY));  //s-10-8
+            
+            sumYYL_2 = SumMacro(auxYY, 0, NORIENTATIONS-1, width(sumYYL_2));  //adjs(s-3-7,DIVIDERINPUT) 
+            sumXXL_2 = SumMacro(auxXX, 0, NORIENTATIONS-1, width(sumXXL_2));  //adjs(s-3-7,DIVIDERINPUT)
+            sumXYL_2 = SumMacro(auxXY, 0, NORIENTATIONS-1, width(sumXYL_2));  //adjs(s-3-7,DIVIDERINPUT)
+		}
+        
+        
+        //  Pipeline 3: partial products of the denominator
+        //  (sumXY*sumXY and sumXX*sumYY) and of the two numerators.
+        par
+        {
+            //aux_den_0 = (adjs(sumXYL_2,width(aux_den_0)+6)*adjs(sumXYL_2,width(aux_den_0)+6))\\6; //s-9-8
+            //aux_den_1 = (adjs(sumXXL_2,width(aux_den_1)+6)*adjs(sumYYL_2,width(aux_den_1)+6))\\6; //s-9-8
+            aux_den_0 = (adjs(sumXYL_2,width(aux_den_0)+8)*adjs(sumXYL_2,width(aux_den_0)+8))\\8; //s-11-6
+            aux_den_1 = (adjs(sumXXL_2,width(aux_den_1)+8)*adjs(sumYYL_2,width(aux_den_1)+8))\\8; //s-11-6
+            
+            //aux_NumX_0 = ((adjs(sumX,width(aux_NumX_0)+7+4)*adjs(sumYYL_2,width(aux_NumX_0)+7+4))\\7)<-DIVIDER_INPUT;  //s-9-8
+            //aux_NumX_1 = ((adjs(sumY,width(aux_NumX_1)+7+4)*adjs(sumXYL_2,width(aux_NumX_1)+7+4))\\7)<-DIVIDER_INPUT;  //s-9-8
+            //aux_NumY_0 = ((adjs(sumX,width(aux_NumY_0)+7+4)*adjs(sumXYL_2,width(aux_NumY_0)+7+4))\\7)<-DIVIDER_INPUT;  //s-9-8
+            //aux_NumY_1 = ((adjs(sumY,width(aux_NumY_1)+7+4)*adjs(sumXXL_2,width(aux_NumY_1)+7+4))\\7)<-DIVIDER_INPUT;  //s-9-8
+            // Live version: drop 3 LSBs of each product, then keep the low
+            // DIVIDER_INPUT bits.
+            aux_NumX_0 = ((adjs(sumX,width(aux_NumX_0)+7+4)*adjs(sumYYL_2,width(aux_NumX_0)+7+4))\\3)<-DIVIDER_INPUT;  //s-5-12
+            aux_NumX_1 = ((adjs(sumY,width(aux_NumX_1)+7+4)*adjs(sumXYL_2,width(aux_NumX_1)+7+4))\\3)<-DIVIDER_INPUT;  //s-5-12
+            aux_NumY_0 = ((adjs(sumX,width(aux_NumY_0)+7+4)*adjs(sumXYL_2,width(aux_NumY_0)+7+4))\\3)<-DIVIDER_INPUT;  //s-5-12
+            aux_NumY_1 = ((adjs(sumY,width(aux_NumY_1)+7+4)*adjs(sumXXL_2,width(aux_NumY_1)+7+4))\\3)<-DIVIDER_INPUT;  //s-5-12
+        }
+        
+        //  Pipeline 4: numerators and denominator handed to the divider.
+        
+        par
+        {
+            //invert((aux_den_0 - aux_den_1), den);            
+            NumX = -(aux_NumX_0 - aux_NumX_1); //s-9-8
+            NumY = (aux_NumY_0 - aux_NumY_1);
+            den = aux_den_0 - aux_den_1;
+        }
+        
+        // Pipeline 5+divlatency: two divider instances compute NumX/den and
+        // NumY/den; in parallel, record whether den was zero.
+        invert(NumX, den, quotX); 
+        invert(NumY, den, quotY); 
+        if(den!=0)
+        par
+        { 
+            bad_div[0]=0;
+        }
+        else
+        par
+        {
+            bad_div[0]=1;
+        }      
+        		
+		// Delay lines: carry the orientation count and the zero-denominator flag
+		// to the output stage.
+        par(i=1; i<THlat; i++)
+		{
+			nc[i]=nc[i-1];
+		}
+        par(d=1; d<DIVLATENCY; d++)
+		{
+			bad_div[d]=bad_div[d-1];
+		}        
+        //  Pipeline 6 + divlatency: emit the quotients (low FLOW_BITS bits,
+        //  shifted right by 2) only when enough orientations were used and the
+        //  denominator was non-zero; otherwise emit NaN.
+        if (nc[THlat-1]>=nc_min && bad_div[DIVLATENCY-1]==0)
+        par
+        {
+            Ox= (quotX <- FLOW_BITS)>>2;
+            Oy= (quotY <- FLOW_BITS)>>2;            
+        }
+        else
+        par
+        {
+            Ox=SetNAN(Ox);   //in matlab is NaN
+            Oy=SetNAN(Oy);   //in matlab is NaN
+        }    
+    }
+}
+
+
+// Invert function : DIVLATENCY cycles
+// -----------------------------
+// invert(Num, Den, quot): despite its name, this version divides Num by Den.
+// It instantiates the external divider_18 core with dividend = Num and
+// divisor = Den (adjusted to DIVIDER_INPUT bits) and registers the core's
+// quotient output into quot.
+// The earlier thresholding of small denominators is disabled (kept below as
+// commented-out code), so no zero or near-zero check on Den is made here.
+// NOTE(review): Num is wired to the DIVIDER_INPUT-wide dividend port without
+// adjs(), so it presumably must already be DIVIDER_INPUT bits wide - confirm.
+macro proc invert(Num, Den, quot)
+{	
+    //signed int DIVIDER_INPUT Den_p0;    
+	//unsigned int (log2ceil(width(Den))) MSB_Den;        
+    //static signed int 14 one = 0b01000000000000;
+    //static signed int 18 one = 0b010000000000000000;
+    //unsigned 1 cond;//[DIVIDER_LATENCY];
+    // Clock-enable wire for the divider core
+    static signal unsigned 1 enable;
+    
+    interface divider_18 (signed DIVIDER_INPUT quot, signed DIVIDER_INPUT remd, unsigned 1 rfd) divider(signed  DIVIDER_INPUT dividend = Num, 
+							signed DIVIDER_INPUT divisor = adjs(Den,DIVIDER_INPUT), unsigned 1 clk=__clock, unsigned 1 ce=enable) with {busformat="B<I>"};
+    
+    par
+    {          
+        // Assert the divider core's clock-enable (the original remark mentioned
+        // an atan2 core, but the core instantiated here is the divider).
+        enable=1;
+        
+        //cond[0]=(abs(Den)<((signed)(0@Div_thr)));
+        //cond=(abs(Den)<((signed)(0@Div_thr)));
+        /*par(i=1;i<DIVIDER_LATENCY;i++)
+        {
+            cond[i]=cond[i-1];
+        }
+        
+        if(cond[DIVIDER_LATENCY-1]==1)*/
+        //if(cond==1)
+        //if(Den<12 && Den>-12)
+        //    quot = 0;
+        //else
+            // Register the quotient produced by the core.
+            quot = divider.quot;
+            //quot = ((signed)one) / adjs(Den,width(quot));        
+    }
+}
+
+//***************************************************
+//Macro new_component_velocity
+//
+//For every orientation, fits a straight line a + b*x to the NFRAMES phase
+//samples P[f][orien] taken at x = XX[f] = {1, 2, 3}. The slope b is returned
+//as the component velocity FV[orien] and the scaled sum of squared residuals
+//of the fit is returned as LE[orien].
+//
+//LATENCY = 4;
+//NOTE(review): the body below is labelled with ten register stages
+//(0, 1, 2_0, 2_1, 2_2, 3_0, 3_1, 4_0, 4_1, 4_2); the figure above may predate
+//the sub-stage split - confirm the real latency.
+//
+//bits format: 	
+//IN:			P[NFRAMES][NORIENT]					--> sign-4-5
+//	
+//OUT:			FV[NFRAMES]                      	--> sign-14-5
+//				LE[NFRAMES]							--> sign-28-5
+//NOTE(review): FV and LE are indexed by orientation in the body (FV[orien],
+//LE[orien]), not by frame as the sizes above suggest.
+//				
+//***************************************************/
+macro proc new_component_velocity(P, FV, LE){
+
+	//***********************************************/
+	//Constant definitions for 3 frames
+	//***********************************************
+	macro expr PSize			= 10;	// bit width of one phase sample
+	macro expr SXX  			= 14;	// sum of XX[f]^2 = 1+4+9
+	macro expr SX 				= 6;	// sum of XX[f]   = 1+2+3
+	macro expr DEN 		    	= 6;	// NFRAMES*SXX - SX*SX; only used in commented-out code
+	const int 3 XX[NFRAMES]		= {1, 2, 3}; //XX3 is XX in the third dimension
+	// WREAL / WIMAG are only referenced by commented-out code in this macro.
+	const int 8 WREAL[NORIENTATIONS]	= {-81, -75, -58, -31, 0, 31, 58, 75};	//	25 * {-F0 * cos(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1} 
+	const int 8 WIMAG[NORIENTATIONS]	= {0, -31, -58, -75, -81, -75, -58, -31};	//	25 * {-F0 * sin(PI/(NORIENT*i)) / (2*PI*F0*F0)} i= {0,1 ... NORIENT-1}
+
+	//***********************************************
+	//Declarations: Variables
+	//***********************************************
+	// Sxy = sum over frames of P*XX, Sy = sum over frames of P
+	int (PSize+2) Sxy[NORIENTATIONS];
+    int (PSize+1) Sxy_0[NORIENTATIONS][NFRAMES]; 
+	int (PSize+1) Sy[NORIENTATIONS];
+    int PSize Sy_0[NORIENTATIONS][NFRAMES];	
+
+	// Intercept a and its intermediate terms
+	int (PSize+3) a[NORIENTATIONS]; 
+    int (PSize+5) a_0[NORIENTATIONS];
+    int (PSize+5) a_1[NORIENTATIONS];
+    int (PSize+12) a_2[NORIENTATIONS];
+    
+	// Slope b and its intermediate terms
+	int (PSize+2) b[NORIENTATIONS];  
+    int (PSize+4) b_0[NORIENTATIONS];
+    int (PSize+4) b_1[NORIENTATIONS];
+    int (PSize+11) b_2[NORIENTATIONS];
+    
+
+
+    // Delayed copy of a
+    int (PSize+3) a3_0[NORIENTATIONS];  
+    
+    // Delayed copies of b, carried to the last stage
+    int (PSize+2)  bs3[NORIENTATIONS];
+    int (PSize+2)  bs3_1[NORIENTATIONS]; 
+    int (PSize+2)  bs4_0[NORIENTATIONS]; 
+    int (PSize+2)  bs4_1[NORIENTATIONS]; 
+	
+
+
+    // Reg_0 = b*XX[f]; Reg = a + b*XX[f] (value of the fitted line at frame f)
+    int (PSize+3) Reg[NFRAMES][NORIENTATIONS];
+    int (PSize+3) Reg_0[NFRAMES][NORIENTATIONS]; 
+	
+	//Pipeline auxiliary variables
+	// Delayed copies of the input samples, one array per register stage
+	int PSize Ps0[NFRAMES][NORIENTATIONS];
+    int PSize Ps1[NFRAMES][NORIENTATIONS];
+	int PSize Ps2[NFRAMES][NORIENTATIONS];
+    int PSize Ps2_1[NFRAMES][NORIENTATIONS];
+    int PSize Ps2_2[NFRAMES][NORIENTATIONS];
+	int PSize Ps3[NFRAMES][NORIENTATIONS];
+    int PSize Ps3_1[NFRAMES][NORIENTATIONS];
+	 
+    
+    // LE_0 = residual per frame, LE_1 = squared residual per frame
+    int (PSize+3) LE_0[NORIENTATIONS][NFRAMES];
+    int (2*PSize) LE_1[NORIENTATIONS][NFRAMES];
+
+	
+	//***********************************************
+	//Body of the function
+	//***********************************************
+	// One independent copy of the pipeline is replicated per orientation.
+	par(orien=0;orien<NORIENTATIONS;orien++)
+	{
+	
+		//Pipeline Stage 0: per-frame terms P*XX and P
+        par(f=0;f<NFRAMES;f++)
+        {
+            Sxy_0[orien][f] = adjs(P[f][orien],width(Sxy_0))*adjs(XX[f],width(Sxy_0));
+            Sy_0[orien][f] = P[f][orien];
+
+            //Copying P for the next stage
+            Ps0[f][orien]=P[f][orien];
+        }
+        //Pipeline Stage 1: sums over the frames
+		par
+		{
+			//Sxy[orien] = (adjs(P[0][orien],width(Sxy))*adjs(XX[0],width(Sxy)) + adjs(P[1][orien],width(Sxy))*adjs(XX[1],width(Sxy)) + adjs(P[2][orien],width(Sxy))*adjs(XX[2],width(Sxy)) + adjs(P[3][orien],width(Sxy))*adjs(XX[3],width(Sxy)) + adjs(P[4][orien],width(Sxy))*adjs(XX[4],width(Sxy)));	
+            Sxy[orien] = SumMacro(Sxy_0[orien], 0, NFRAMES-1,width(Sxy));
+			//Sy[orien]  = adjs(P[0][orien],width(Sy)) + adjs(P[1][orien],width(Sy)) + adjs(P[2][orien],width(Sy)) + adjs(P[3][orien],width(Sy)) + adjs(P[4][orien],width(Sy));
+            Sy[orien] = SumMacro(Sy_0[orien], 0, NFRAMES-1,width(Sy));
+
+			//Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps1[f][orien]=Ps0[f][orien];
+			}
+		}
+
+		//Pipeline Stage 2_0: the four products of the fit numerators
+        par
+        {
+            a_0[orien] = SXX*adjs(Sy[orien],width(a_0));
+            b_0[orien] = NFRAMES*adjs(Sxy[orien],width(b_0));
+                    
+            a_1[orien] = SX*adjs(Sxy[orien],width(a_1));
+            b_1[orien] = SX*adjs(Sy[orien],width(b_1));
+
+            //Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2[f][orien]=Ps1[f][orien];
+			}
+        } 
+        //Pipeline Stage 2_1: numerators a_2 = SXX*Sy - SX*Sxy, b_2 = NFRAMES*Sxy - SX*Sy
+        par
+        {
+            //a_2[orien] = (a_1[orien]-a_2[orien])*5;  // 5 frames
+            //b_2[orien] = (b_1[orien]-b_2[orien])*5;  // multiplied by 5 for following /50 division that become <<8 : 5/256 ~= 1/50
+            a_2[orien] = (adjs(a_0[orien],width(a_2))-adjs(a_1[orien],width(a_2)));    // 3 frames
+            b_2[orien] = (adjs(b_0[orien],width(b_2))-adjs(b_1[orien],width(b_2)));
+
+            //Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2_1[f][orien]=Ps2[f][orien];
+			}
+        } 
+        //Pipeline Stage 2_2: scale the numerators by 43/256, which apparently
+        //stands in for the division by DEN = 6 (43/256 ~= 1/6)
+		par
+		{
+			//Using 5 decimals for a and b (*25)
+			//a[orien] = (adjs(SXX,PSize+15)*32*adjs(Sy[orien],PSize+15) - adjs(SX,PSize+15)*32*adjs(Sxy[orien],PSize+15))/adjs(DEN,PSize+15);
+            //a[orien] = a_2[orien]<<8;  //for 5 frames
+            //a[orien] = a_2[orien]<<3;  //for 3 frames
+			//a[orien] = adjs(((a_2[orien])*21)>>7,width(a));  //for 3 frames
+            a[orien] = adjs((a_2[orien]*43)>>8,width(a));  //for 3 frames
+            
+			//b[orien] = adjs((NFRAMES*32*adjs(Sxy[orien],PSize+13) - adjs(SX,PSize+13)*32*adjs(Sy[orien],PSize+13))/adjs(DEN,PSize+13), width(b));
+            //b[orien] = adjs(b_2[orien]<<8,width(b));  // for 5 frames
+            //b[orien] = adjs(((b_2[orien])*21)>>7,width(b));  // for 3 frames
+            b[orien] = adjs((b_2[orien]*43)>>8,width(b));  // for 3 frames
+            
+			
+			//Copying P for the next stage
+			par(f=0;f<NFRAMES;f++)
+			{
+				Ps2_2[f][orien]=Ps2_1[f][orien];
+			}
+		}
+
+		//Pipeline Stage 3_0: slope term b*XX[f] for every frame
+        par
+        {
+            par(f=0;f<NFRAMES;f++)
+            {
+                Reg_0[f][orien] = adjs(b[orien],width(Reg_0))*adjs(XX[f],width(Reg_0));
+            
+                //Copying P for the next stage
+    		    Ps3[f][orien]=Ps2_2[f][orien];
+            }
+            //Copying b for the next stage
+			bs3[orien]=b[orien];
+            a3_0[orien] = a[orien];
+        }
+        //Pipeline Stage 3_1: value of the fitted line, a + b*XX[f]
+		par
+		{
+			par(f=0;f<NFRAMES;f++)
+			{
+				//Reg[fr][orien] = adjs(a[orien],width(Reg))+ adjs(b[orien],width(Reg))*adjs(XX[fr],width(Reg));
+                Reg[f][orien] = adjs(a3_0[orien],width(Reg))+ adjs(Reg_0[f][orien],width(Reg));
+			
+				//Copying P for the next stage
+				Ps3_1[f][orien]=Ps3[f][orien];
+			}
+			
+			//Copying b for the next stage
+			bs3_1[orien]=bs3[orien];
+		}
+	
+		//Pipeline Stage 4_0: residual between the fitted line and the sample
+        par
+        {
+            par(f=0;f<NFRAMES;f++)
+            {
+                LE_0[orien][f] = adjs(Reg[f][orien],width(LE_0)) - adjs(Ps3_1[f][orien], width(LE_0));
+            }
+            //Copying b for the next stage
+            bs4_0[orien]=bs3_1[orien];
+        }
+        //Pipeline Stage 4_1: squared residual
+            par
+            {
+            par(f=0;f<NFRAMES;f++)
+            {
+                LE_1[orien][f] = adjs(LE_0[orien][f],width(LE_1))*adjs(LE_0[orien][f],width(LE_1));
+            }
+            //Copying b for the next stage
+            bs4_1[orien]=bs4_0[orien];
+        }
+        //Pipeline Stage 4_2: outputs
+		par
+		{
+			//LE[orien] = adjs(((((adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien], 2*PSize+26)*32)*(adjs(Reg[0][orien],2*PSize+26)- adjs(Ps3[0][orien],2*PSize+26)*32) + (adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32)*(adjs(Reg[1][orien], 2*PSize+26)- adjs(Ps3[1][orien],2*PSize+26)*32) + (adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32)*(adjs(Reg[2][orien],2*PSize+26)- adjs(Ps3[2][orien],2*PSize+26)*32) + (adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32)*(adjs(Reg[3][orien],2*PSize+26)- adjs(Ps3[3][orien],2*PSize+26)*32) + (adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32)*(adjs(Reg[4][orien],2*PSize+26)- adjs(Ps3[4][orien],2*PSize+26)*32))/NFRAMES)\\15), width(LE));
+            //LE[orien] = adjs(((SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize)*21)>>6)\\2,width(LE));
+            // Sum of the squared residuals scaled by 85/256 (apparently the
+            // division by NFRAMES = 3, since 85/256 ~= 1/3), then 2 LSBs dropped.
+            LE[orien] = adjs(((SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize+2)*85)>>8)\\2,width(LE));
+			//LE[orien]=adjs(((LE_1[0][orien]+LE_1[1][orien]+LE_1[2][orien])>>2)\\15,width(LE));
+
+			//LE[orien] = adjs(SumMacro(LE_1[orien],0, NFRAMES-1,2*PSize+26)\\15, width(LE));
+			
+			//Simplifying the equation: FVreal = - (F0*cos(ang)/2*PI)*b[orien] --> FVreal = Wreal[orien]*b[orien] //Wreal is initialised with factor 25
+			//							FVimag = - (F0*sin(ang)/2*PI)*b[orien] --> FVimag = Wimag[orien]*b[orien] //Wreal is initialised with factor 25
+			
+			//FVreal[orien]= (adjs(bs4_1[orien],PSize+8)*adjs(WREAL[orien],PSize+8))\\6; //final size of FVreal is PSize+18
+			//FVimag[orien]= (adjs(bs4_1[orien],PSize+8)*adjs(WIMAG[orien],PSize+8))\\6; //final size of FVimag is PSize+18
+			//FVreal[orien]= ((adjs(bs4_1[orien],PSize+8)*adjs(WREAL[orien],PSize+8))\\2)<-width(FVreal); //final size of FVreal is PSize+18
+			//FVimag[orien]= ((adjs(bs4_1[orien],PSize+8)*adjs(WIMAG[orien],PSize+8))\\2)<-width(FVimag); //final size of FVimag is PSize+18
+            //FVreal[orien]= ((adjs(bs4_1[orien],PSize+10)*adjs(WREAL[orien],PSize+10))\\4)<-width(FVreal); //final size of FVreal is PSize+18
+			//FVimag[orien]= ((adjs(bs4_1[orien],PSize+10)*adjs(WIMAG[orien],PSize+10))\\4)<-width(FVimag); //final size of FVimag is PSize+18
+            
+            // The component velocity is the delayed slope b; the orientation
+            // weights are not applied in this macro.
+            FV[orien]=adjs(bs4_1[orien], width(FV)); //Only 5 bits for Frac !!!!
+		}
+	}
+}
+
--- a/attention/attention_v0.1/opticflow.hch
+++ b/attention/attention_v0.1/opticflow.hch
+#ifndef __OPTICFLOW__
+#define __OPTICFLOW__
+
+#include "GaborPrimitives.hch"
+
+// Number of frames and filter orientations processed by the optic flow macros.
+#define NFRAMES 3
+#define NORIENTATIONS 8
+
+// atan2 core configuration
+#define ATAN2WIDTH 	 10 //19 //24
+#define ATAN2OUTWIDTH 	 10 //9 //19 //24
+#define ATAN2LATENCY (ATAN2OUTWIDTH+4)
+#define ATAN2NAME atan2_10bit //atan2_19bit
+
+// Divider core configuration
+#define DIVIDER_NAME divider_21 
+#define DIVIDER_NAME_2 divider_27
+#define DIVIDER_INPUT 18
+// Parenthesised so the macro expands as a single value inside any larger
+// expression (e.g. 2*DIVIDER_LATENCY), matching ATAN2LATENCY above.
+#define DIVIDER_LATENCY (DIVIDER_INPUT+4) // is +4 if divider has clks/div==1
+
+// Flow output width and default parameters
+#define FLOW_BITS 12
+#define THRESHOLD 16
+#define NC_MIN 4
+#define EPS 0
+// NaN marker: 12-bit pattern with only the top bit set (same width as FLOW_BITS).
+#define NAN 0b100000000000
+
+// CORES
+macro proc CoreATAN2CORDIC_fl(y, x, enable, angle);
+macro proc CoreDIVIDER(my_dividend, my_divisor, result, enable);
+macro proc CoreDIVIDER_2(my_dividend, my_divisor, result, enable);
+
+//***************************************************
+//Macro component_velocity
+//***************************************************
+macro proc component_velocity(P, FVreal, FVimag, LE);
+macro proc component_velocity_mia(P, FVreal, FVimag, LE);
+macro proc new_component_velocity(P, FV, LE);
+
+
+//***************************************************
+//Macro compute_phase
+//***************************************************
+macro proc compute_phase(Greal, Gimag, P);
+macro proc compute_single_phase(Greal, Gimag, P);
+
+//***************************************************
+//Macro unwrap
+//***************************************************
+macro proc unwrap(Pin, Pout);
+macro proc unwrap_3(Pin, Pout);
+
+//***************************************************
+//Macro full_velocity
+//***************************************************
+macro proc full_velocity(FVx,FVy,LE,thres,nc_min, enable, Ox, Oy);
+macro proc full_velocity_small(FVx,FVy,LE,thres, Div_thr, nc_min, Ox, Oy);
+macro proc new_full_velocity(FV, LE,thres, nc_min, Ox, Oy);
+
+// Pipelined signed division through the divider cores.
+macro proc divide12(Num, Den, quot);
+
+macro proc invert(Num, Den, quot);
+
+
+//***************************************************
+//Resource sharing functions
+//***************************************************
+macro proc compute_phase_top(Greal,Gimag, P, index);
+void function_compute_phase(signed int F_BITS (*Greal),signed int F_BITS (*Gimag), signed int 9 *P);
+macro proc compute_phase_index(Greal, Gimag, P);
+
+#endif
\ No newline at end of file
--- a/attention/attention_v0.1/parameters.hch
+++ b/attention/attention_v0.1/parameters.hch
+/* parameters.hch
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/
+
+#ifndef __PARAMETERS__
+#define __PARAMETERS__
+
+// Number of cameras (1 for single camera, 2 for stereo system)
+//#define NCAMERAS 2 
+
+// Maximum image resolution in pixels (width, height) and the resulting
+// maximum number of pixels per image
+#define MAX_RES_X 1024
+#define MAX_RES_Y 1024
+#define MAX_IMSIZE (MAX_RES_X*MAX_RES_Y) 
+
+// Number of frames we are using.
+// NOTE(review): opticflow.hch defines NFRAMES with the same value; keep the
+// two definitions in sync.
+#define NFRAMES 3
+
+#endif
\ No newline at end of file
--- a/optical_flow/gradient_based_method/flow_v0.1/cores.hcc
+++ b/optical_flow/gradient_based_method/flow_v0.1/cores.hcc
@@ -20,7 +20,7 @@
 %	ImSize		- Size of the input images
 %
 %	DESCRIPTION
-%				Interface for a top architecture to interface with the disparity estimation core
+%				Interface for a top architecture to interface with the optic flow estimation core
 % RETURN
 %   
 */

--- a/optical_flow/gradient_based_method/flow_v0.1/cores.hch
+++ b/optical_flow/gradient_based_method/flow_v0.1/cores.hch
@@ -8,7 +8,7 @@

 #include "stdlib.hch"
 #include "channels.hch"
-//#include "xircav4_lib.hch" Platform-dependent
+//#include "xircav4_lib.hch" //Platform-dependent

 #define CORE 1 // 0 for sub-circuit test, 1 for core calls


--- a/optical_flow/gradient_based_method/flow_v0.1/lklib.hcc
+++ b/optical_flow/gradient_based_method/flow_v0.1/lklib.hcc
+/* lklib.hcc
+%   Copyright (C) 2014  Francisco Barranco, 09/02/2014, University of Granada-University of Maryland.
+%   License, GNU GPL, free software, without any warranty.
+*/

 #include "lklib.hch"
 #include "cores.hch"