aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHans-Nikolai Viessmann2016-07-13 17:27:54 +0100
committerHans-Nikolai Viessmann2016-07-13 17:27:54 +0100
commitc542ab4718a3922fb5cafe0db49ec48b606cb119 (patch)
tree7864dba623c1043319f5fe689f2a529465448c29
parent5af21293df2bec349b2c3c89f781d9454594b4e0 (diff)
downloadopenspl-16-c542ab4718a3922fb5cafe0db49ec48b606cb119.tar.gz
openspl-16-c542ab4718a3922fb5cafe0db49ec48b606cb119.zip
Finally some MaxJ Code....x
-rw-r--r--exercise2-classification/RadiusEngineParameters.maxj50
-rw-r--r--exercise2-classification/RadiusKernel.maxj123
-rw-r--r--exercise2-classification/RadiusManager.maxj125
-rw-r--r--exercise2-classification/cpu_code.c239
4 files changed, 537 insertions, 0 deletions
diff --git a/exercise2-classification/RadiusEngineParameters.maxj b/exercise2-classification/RadiusEngineParameters.maxj
new file mode 100644
index 0000000..c84f8e7
--- /dev/null
+++ b/exercise2-classification/RadiusEngineParameters.maxj
@@ -0,0 +1,50 @@
+package radius;
+
+import com.maxeler.maxcompiler.v2.build.EngineParameters;
+
+public class RadiusEngineParameters extends EngineParameters {
+
+ public RadiusEngineParameters(String[] args) {
+ super(args);
+ }
+
+ private static final String s_streamFrequency = "streamFrequency";
+ private static final String s_dimensions = "dimensions";
+ private static final String s_numPipes = "numPipes";
+ private static final String s_maxClasses = "s_maxClasses";
+
+ @Override
+ protected void declarations() {
+ declareParam(s_streamFrequency, DataType.INT, 100);
+ declareParam(s_dimensions, DataType.INT, 8);
+ declareParam(s_numPipes, DataType.INT, 32);
+ declareParam(s_maxClasses, DataType.INT, 32768); //making this value less that numPipes * 512 will not save any BRAMs, but making it numPipes * 513 will double it.
+ }
+
+ @Override
+ protected void validate() {
+ if (getStreamFrequency() <= 0) {
+ throw new IllegalArgumentException("Stream frequency should be > 0.");
+ }
+ if (getNumPipes() < 1) {
+ throw new IllegalArgumentException("Number of pipes must be at least 1.");
+ }
+ if (getMaxClasses() % getNumPipes() != 0) {
+ throw new IllegalArgumentException("Maximum number of classes must be a multiple of the number of pipes.");
+ }
+ }
+
+ public int getStreamFrequency() { return getParam(s_streamFrequency); }
+ public int getDimensions() { return getParam(s_dimensions); }
+ public int getNumPipes() { return getParam(s_numPipes); }
+ public int getMaxClasses() { return getParam(s_maxClasses); }
+
+ @Override
+ public String getBuildName() {
+ return getMaxFileName() + "_" +
+ getTarget() + "_" +
+ getDimensions() + "D_" +
+ getNumPipes() + "pipes_" +
+ getStreamFrequency() + "MHz";
+ }
+}
diff --git a/exercise2-classification/RadiusKernel.maxj b/exercise2-classification/RadiusKernel.maxj
new file mode 100644
index 0000000..1c5da4c
--- /dev/null
+++ b/exercise2-classification/RadiusKernel.maxj
@@ -0,0 +1,123 @@
+package radius;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import com.maxeler.maxcompiler.v0.utils.MathUtils;
+import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel;
+import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.KernelMath;
+import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters;
+import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.memory.Memory;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEFloat;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.composite.DFEVector;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.composite.DFEVectorType;
+
+
+public class RadiusKernel extends Kernel {
+
+ public static final DFEType inputType = dfeFloat(8, 24);
+
+ private final int m_numPipes;
+ private final int m_dimensions;
+
+ protected RadiusKernel(KernelParameters parameters, int dimensions, int numPipes, int maxClasses) {
+ super(parameters);
+ m_numPipes = numPipes;
+ m_dimensions = dimensions;
+ applyOptimizations();
+
+ DFEType addressType = dfeUInt(MathUtils.bitsToAddress(maxClasses / numPipes));
+ DFEVar cyclesPerPoint = io.scalarInput("cyclesPerPoint", addressType); // numClasses / numPipes
+ DFEVar address = control.count.makeCounterChain().addCounter(cyclesPerPoint, 1); // Count up to cyclesPerPoint
+
+ DFEVector<DFEVar> input = io.input("a", new DFEVectorType<DFEVar>(inputType, dimensions), address === 0);
+ List<DFEVar> centres = createCentresRoms(maxClasses, inputType, address);
+ List<DFEVar> radii2 = createRadii2Roms(maxClasses, inputType, address);
+// DFEVar distinf = inputType.newInstance(this); // for L-inf norm
+
+ //Declare array for Euclidean distances
+ DFEVector<DFEVar> output = (new DFEVectorType<DFEVar>(dfeBool(), numPipes)).newInstance(this);
+ // Compute the sum of each row of squared differences
+ for(int pipe = 0 ; pipe < numPipes; pipe++) {
+ //DFEVar[] summands = new DFEVar[dimensions];
+ DFEVar distinf = constant.zero(inputType); // for L-inf norm
+
+// distinf <== 0;
+ //Determine the squared difference between each data point in the matrix and its corresponding input element
+ for (int dim = 0; dim < dimensions; dim++) {
+ //MaxCompiler does not do common sub-expression elimination, as sometimes you will want to deliberately
+ //duplicate compute. This means that we do "diff = a-b; diff * diff" instead of "(a-b)*(a-b)" to avoid
+ //creating two subtraction nodes and using extra LUTs.
+ DFEVar diff = input[dim] - centres[pipe * m_dimensions + dim];
+ //summands[dim] = diff * diff; // L2-Norm
+ //summands[dim] = abs(diff);
+ distinf = KernelMath.max(distinf, KernelMath.abs(diff));
+ }
+ // If the distance squared is less than the cluster radius squared, update output with the index, if not, -1
+ //output[pipe] <== adderTree(summands) < radii2[pipe];
+ output[pipe] <== distinf < radii2[pipe];
+ }
+
+ //Output
+ io.output("c", output, output.getType());
+ }
+
+
+ //Declare individual mapped Roms to stream centres from data cluster matrix.
+ private List<DFEVar> createCentresRoms(int maxClasses, DFEType type, DFEVar address) {
+ List<DFEVar> output = new ArrayList<DFEVar>();
+ for (int i = 0; i < m_numPipes * m_dimensions; i++) {
+ Memory<DFEVar> mappedRom = mem.alloc(type, maxClasses / m_numPipes);
+ String name = String.format("centresRom%04d", i);
+ mappedRom.mapToCPU(name);
+ output.add(mappedRom.read(address));
+ }
+
+ return output;
+ }
+
+
+ //Declare individual mapped Roms to stream Radii from data cluster matrix.
+ private List<DFEVar> createRadii2Roms(int maxClasses, DFEType type, DFEVar address) {
+ List<DFEVar> output = new ArrayList<DFEVar>();
+ for (int i = 0; i < m_numPipes; i++) {
+ Memory<DFEVar> mappedRom = mem.alloc(type, maxClasses / m_numPipes);
+ String name = String.format("radii2Rom%04d", i);
+ mappedRom.mapToCPU(name);
+ output.add(mappedRom.read(address));
+ }
+
+ return output;
+ }
+
+
+ //In general, adder trees are more efficient than chains of adders as the latency is lower,
+ //and thus requires fewer resources for scheduling.
+ //A chain has latency O(n) while a tree has latency O(log2(n)).
+ private DFEVar adderTree(DFEVar... input) {
+ DFEVar[] sums = new DFEVar[input.length / 2 + (input.length % 2)];
+ for (int i = 0; i < input.length / 2; i++) {
+ sums[i] = input[2 * i] + input[2 * i + 1];
+ }
+ if (input.length % 2 == 1) {
+ sums[sums.length - 1] = input[input.length - 1];
+ }
+ return sums.length == 1 ? sums[0] : adderTree(sums);
+ }
+
+ //This is a floating point heavy design, so the following optimisation settings work well.
+ private void applyOptimizations() {
+ if (this.getManager().getManagerConfiguration().getBoardModel().isAltera()) {
+ //pipelining factor is between 0 and 1. By default the compiler builds all nodes
+ //with the maximum pipelining. In practice this is far too much for floating point units.
+ optimization.pushPipeliningFactor(0.5);
+ } else {
+ optimization.pushPipeliningFactor(0.2);
+ //Xilinx tools allow you to use extra DSPs to save LUTs on floating point additions and multiplications.
+ //When using single precision on Vectis you will generally run out of LUTs before you run out of DSPs.
+ optimization.pushDSPFactor(1.0);
+ }
+ }
+}
diff --git a/exercise2-classification/RadiusManager.maxj b/exercise2-classification/RadiusManager.maxj
new file mode 100644
index 0000000..2f0ef48
--- /dev/null
+++ b/exercise2-classification/RadiusManager.maxj
@@ -0,0 +1,125 @@
+package radius;
+
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEFloat;
+import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType;
+import com.maxeler.maxcompiler.v2.managers.BuildConfig;
+import com.maxeler.maxcompiler.v2.managers.BuildConfig.Effort;
+import com.maxeler.maxcompiler.v2.managers.BuildConfig.OptimizationTechnique;
+import com.maxeler.maxcompiler.v2.managers.custom.CustomManager;
+import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.CPUTypes;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.EngineInterface;
+import com.maxeler.maxcompiler.v2.managers.engine_interfaces.InterfaceParam;
+
+public class RadiusManager extends CustomManager {
+
+ private static final String s_kernelName = "radiusKernel";
+
+ public RadiusManager(RadiusEngineParameters params) {
+ super(params);
+ addMaxFileConstant("dimensions", params.getDimensions());
+ addMaxFileConstant("num_pipes", params.getNumPipes());
+ addMaxFileConstant("max_classes", params.getMaxClasses());
+
+ config.setDefaultStreamClockFrequency(params.getStreamFrequency());
+
+ KernelBlock block = addKernel(new RadiusKernel(
+ makeKernelParameters(s_kernelName), params.getDimensions(), params.getNumPipes(), params.getMaxClasses()));
+
+ for (String inputName : block.getAllInputs())
+ block.getInput(inputName).connect(addStreamFromCPU(inputName));
+
+ for (String outputName : block.getAllOutputs())
+ addStreamToCPU(outputName).connect(block.getOutput(outputName));
+
+ createSLiCinterface(modeDefault(params.getNumPipes(), params.getDimensions()));
+
+ configBuild(params);
+ }
+
+
+ private EngineInterface modeDefault(int numPipes, int dimensions) {
+ EngineInterface ei = new EngineInterface();
+ InterfaceParam P = ei.addParam("P", CPUTypes.UINT64);
+ InterfaceParam C = ei.addParam("C", CPUTypes.UINT64);
+
+ ei.setScalar(s_kernelName, "cyclesPerPoint", C / numPipes);
+ ei.setTicks(s_kernelName, (P * C) / numPipes);
+
+ CPUTypes inputType = getType(RadiusKernel.inputType);
+ InterfaceParam inputSize = inputType.sizeInBytes() * P * dimensions;
+ InterfaceParam outputSize = P * C / 8;
+
+ ei.setStream("a", inputType, inputSize);
+ ei.setStream("c", CPUTypes.UINT8, outputSize);
+ return ei;
+ }
+
+
+ private void configBuild(RadiusEngineParameters params) {
+ BuildConfig buildConfig = getBuildConfig();
+ buildConfig.setBuildEffort(params.getNumPipes() < 100 ? Effort.MEDIUM : Effort.HIGH);
+ buildConfig.setOptimizationGoal(OptimizationTechnique.AREA);
+ }
+
+
+ private CPUTypes getType(DFEType type) {
+ CPUTypes output;
+ if (type.isInt()) {
+ switch (type.getTotalBits()) {
+ case 8:
+ output = CPUTypes.INT8;
+ break;
+ case 16:
+ output = CPUTypes.INT16;
+ break;
+ case 32:
+ output = CPUTypes.INT32;
+ break;
+ case 64:
+ output = CPUTypes.INT64;
+ break;
+ default:
+ throw new RuntimeException("Integer inputs and outputs must be either 8, 16, 32 or 64 bits long.");
+ }
+ } else if (type.isUInt()) {
+ switch (type.getTotalBits()) {
+ case 8:
+ output = CPUTypes.UINT8;
+ break;
+ case 16:
+ output = CPUTypes.UINT16;
+ break;
+ case 32:
+ output = CPUTypes.UINT32;
+ break;
+ case 64:
+ output = CPUTypes.UINT64;
+ break;
+ default:
+ throw new RuntimeException("Unsigned inputs and outputs must be either 8, 16, 32 or 64 bits long.");
+ }
+ } else if (type instanceof DFEFloat) {
+ switch (type.getTotalBits()) {
+ case 32:
+ output = CPUTypes.FLOAT;
+ break;
+ case 64:
+ output = CPUTypes.DOUBLE;
+ break;
+ default:
+ throw new RuntimeException("Floating point inputs and outputs must be either float or double.");
+ }
+ } else {
+ throw new RuntimeException("Inputs and outputs must be either integers or floating point numbers.");
+ }
+
+ return output;
+ }
+
+
+ public static void main(String[] args) {
+ RadiusManager manager = new RadiusManager(new RadiusEngineParameters(args));
+ manager.build();
+ }
+}
diff --git a/exercise2-classification/cpu_code.c b/exercise2-classification/cpu_code.c
new file mode 100644
index 0000000..71eafe7
--- /dev/null
+++ b/exercise2-classification/cpu_code.c
@@ -0,0 +1,239 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "DFE.h"
+#include "MaxSLiCInterface.h"
+
+
+/*
+ * Constants
+ */
+
+#define NUM_CLASSES 512 // Number of classes
+#define NUM_POINTS (4*512) // Number of data points to be classified
+
+
+/*
+ * Types
+ */
+
+typedef struct __attribute__ ((__packed__)) {
+ unsigned bool0 : 1;
+ unsigned bool1 : 1;
+ unsigned bool2 : 1;
+ unsigned bool3 : 1;
+ unsigned bool4 : 1;
+ unsigned bool5 : 1;
+ unsigned bool6 : 1;
+ unsigned bool7 : 1;
+} packed_bool_t;
+
+
+/*
+// USE THIS CODE FOR STEP3
+static inline unsigned get_bool_from_packed(packed_bool_t *bools, size_t index, size_t num_pipes) {
+ size_t batch = index / (num_pipes * NUM_CLASSES);
+ size_t class = index % NUM_CLASSES;
+ size_t pipe = (index / NUM_CLASSES) % num_pipes;
+ size_t real_index = batch * (num_pipes * NUM_CLASSES) + class * num_pipes + pipe;
+ packed_bool_t b = bools[real_index / 8];
+ unsigned output[8] = { b.bool0, b.bool1, b.bool2, b.bool3,
+ b.bool4, b.bool5, b.bool6, b.bool7 };
+ return output[real_index % 8];
+}
+*/
+
+static inline unsigned get_bool_from_packed(packed_bool_t *bools, size_t index) {
+ packed_bool_t b = bools[index / 8];
+ unsigned output[8] = { b.bool0, b.bool1, b.bool2, b.bool3,
+ b.bool4, b.bool5, b.bool6, b.bool7 };
+ return output[index % 8];
+}
+
+
+static inline void *safe_malloc(size_t size) {
+ void *output = malloc(size);
+ if (!output) {
+ fprintf(stderr, "ERROR: failed to allocate array of size %lu\n", size);
+ exit(1);
+ }
+ return output;
+}
+
+
+float *random_array(int length) {
+ float *output = safe_malloc(length * sizeof(float));
+ for (int i = 0; i < length; i++) {
+ output[i] = random() % 100;
+ }
+ return output;
+}
+
+
+float *random_array_squared(int length) {
+ float *output = random_array(length);
+ for (int i = 0; i < length; i++) {
+ output[i] *= output[i];
+ }
+ return output;
+}
+
+
+/*
+ * Function: radius_cpu
+ * ------------------
+ * Calculates whether a given point belongs to each of the given classes.
+ * A class is an n-sphere and a point belongs to a class if it lies within its radius.
+ * In order to check whether a point lies within the radius of the class, we calculate
+ * the Euclidean distance between the point and the centre of the class. The point
+ * would lie within the class if this distance is less than the radius of the class.
+ *
+ * Arguments
+ * ---------
+ * class_centres: Centre points of all the classes
+ * class_radii2: Square radii of all the classes
+ * points: Data points to be classified
+ *
+ * Returns
+ * -------
+ * result: Belong-ness relation between the given data point and all the classes
+ */
+unsigned *radius_cpu(float *centres, float *radii2, float *points, int dimensions) {
+ printf("Running on CPU. dimensions = %d \n", dimensions);
+ double start = clock();
+
+ unsigned *result = safe_malloc(NUM_POINTS * NUM_CLASSES * sizeof(int));
+ for (int p = 0; p < NUM_POINTS; p++) {
+ // For each class, calculate the square euclidean distance between its centre and the given point
+ for (int c = 0; c < NUM_CLASSES; c++) {
+ //float dist1 = 0;
+ float distinf = 0;
+ // For each dimension, calculate the euclidean term and accumulate
+ for (int d = 0; d < dimensions; d++) {
+ float diff = centres[d + dimensions * c] - points[d + dimensions * p];
+ //dist1 += abs(diff);
+ distinf = fmax(distinf, abs(diff));
+ }
+ // If euclidean distance is smaller than the class radius, the data point belongs to the class
+ result[NUM_CLASSES * p + c] = (distinf < radii2[c]) ? 1 : 0;
+ }
+ }
+
+ printf("total time taken by CPU: %.3lf\n", (clock() - start) / CLOCKS_PER_SEC);
+
+ return result;
+}
+
+
+void set_mapped_roms(max_file_t *mf, DFE_actions_t *actions, float *centres, float *radii2) {
+ int dimensions = max_get_constant_uint64t(mf, "dimensions"); // Number of dimensions
+ int num_pipes = max_get_constant_uint64t(mf, "num_pipes"); //Number of classes checked in parallel on the DFE
+ int rom_depth = NUM_CLASSES / num_pipes;
+
+ // Initialise FMEMs.
+ double ** centreRom = (double**) &(actions->inmem_radiusKernel_centresRom0000);
+ for (int r = 0; r < dimensions * num_pipes; r++) {
+ centreRom[r] = safe_malloc(rom_depth * sizeof(double));
+ for (int i = 0; i < rom_depth; i++) {
+ centreRom[r][i] = centres[r + dimensions * num_pipes * i];
+ }
+ }
+
+ double ** radii2Rom = (double**) &(actions->inmem_radiusKernel_radii2Rom0000);
+ for (int r = 0; r < num_pipes; r++) {
+ radii2Rom[r] = safe_malloc(rom_depth * sizeof(double));
+ for (int i = 0; i < rom_depth; i++) {
+ radii2Rom[r][i] = radii2[r + num_pipes * i];
+ }
+ }
+}
+
+
+/*
+ * Function: radius_dfe
+ * ------------------
+ * The central idea of the DFE implementation is to load up the Class centres as well as radii on the FMEMs.
+ * In order to maximise the bandwidth for accessing the FMEMs, we are going to use multiple banks and interleave
+ * the data. We can do this here because the problem is embarrassingly parallel - evident from the fact that
+ * each invocation of the radiusCPU function is completely independent from the other.
+ *
+ * Arguments
+ * ---------
+ * class_centres: Centre points of all the classes
+ * class_radii2: Square radii of all the classes
+ * points: Data points to be classified
+ *
+ * Returns
+ * -------
+ * result: Belong-ness relation between the given data point and all the classes
+ */
+packed_bool_t *radius_dfe(float *centres, float *radii2, float *points, max_file_t *mf) {
+ packed_bool_t *out_dfe = safe_malloc(NUM_POINTS * NUM_CLASSES / 8);
+ max_engine_t * me = max_load(mf, "*");
+
+ DFE_actions_t actions;
+ actions.param_P = NUM_POINTS;
+ actions.param_C = NUM_CLASSES;
+ actions.instream_a = points;
+ actions.outstream_c = (uint8_t*)out_dfe;
+
+ set_mapped_roms(mf, &actions, centres, radii2);
+
+ printf("Running on DFE.\n");
+ double start = clock();
+ DFE_run(me, &actions);
+ printf("total time taken by DFE: %.3lf\n", (clock() - start) / CLOCKS_PER_SEC);
+ max_unload(me);
+
+ return out_dfe;
+}
+
+
+/*
+ * Important
+ * ---------
+ *
+ * Classes are n-spheres. They have a centre point - comprising N floats for an
+ * N-dimension problem space and radius-squared which is also a float. We store
+ * radius squared instead of radius because we only need to compare to the square
+ * of Euclidean distance so we save ourselves from having to calculate the square
+ * root of at least 2 numbers in every iteration.
+ */
+
+int main(void) {
+ // Seed the random number with time
+ srand(time(NULL));
+ // Open the maxfile (we need this now to get the number of dimensions).
+ max_file_t *mf = DFE_init();
+ int dimensions = max_get_constant_uint64t(mf, "dimensions"); // Number of dimensions
+ int max_classes = max_get_constant_uint64t(mf, "max_classes"); // Maximum number of classes the ROMs can hold
+// USE THIS FOR STEP3
+// int num_pipes = max_get_constant_uint64t(mf, "num_point_pipes"); // Number of points checked in parallel on the DFE
+ if (NUM_CLASSES > max_classes) {
+ fprintf(stderr, "ERROR: maxfile only supports %d classes, rebuild to support %d classes\n", max_classes, NUM_CLASSES);
+ exit(2);
+ }
+ if ((NUM_POINTS * NUM_CLASSES) % 128 != 0 || (NUM_POINTS * dimensions * sizeof(float)) % 16 != 0) {
+ fprintf(stderr, "ERROR: inputs and outputs must be 128 bit aligned.\n");
+ exit(3);
+ }
+
+ float *points = random_array(NUM_POINTS * dimensions); // The input data points that are to be classified
+ float *centres = random_array(dimensions * NUM_CLASSES); // The centre point of the classes
+ float *radii2 = random_array_squared(NUM_CLASSES); // The squared-radius of the classes
+
+ unsigned *out_cpu = radius_cpu(centres, radii2, points, dimensions);
+ packed_bool_t *out_dfe = radius_dfe(centres, radii2, points, mf);
+
+ // Check CPU code output against DFE output
+ for (unsigned i = 0; i < NUM_POINTS * NUM_CLASSES; i++) {
+ if (out_cpu[i] != get_bool_from_packed(out_dfe, i)) {
+ printf("%u %u %u\n", i, out_cpu[8 * i], get_bool_from_packed(out_dfe, i));
+ return 1;
+ }
+ }
+ printf("Done.\n");
+ return 0;
+}