diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index f0534b5ce..8bcbefdf8 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -713,12 +713,10 @@ void generator::visit_atomic_add_inst(ir::atomic_add_inst* add) { // vector size int vector_size = 1; - /* int ld = ptrs->get_order()[0]; unsigned alignment = alignment_->get(ptr, ld); vector_size = gcd(ptrs->axis(ld).contiguous, alignment); vector_size = std::min(vector_size, val->get_type()->get_tile_element_ty()->is_half_ty() ? 2 : 1); - vector_size = 1; std::map packets; for_each(val, [&](indices_t idx){ @@ -729,19 +727,18 @@ void generator::visit_atomic_add_inst(ir::atomic_add_inst* add) { packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); packets[id] = builder_->CreateInsertElement(packets.at(id), in_value, linear % vector_size); }); - */ for_each(ptr, [&](indices_t idx){ unsigned linear = vals->get_linear_index(idx); unsigned id = linear / vector_size; - //if(linear % vector_size != 0) - // return; + if(linear % vector_size != 0) + return; // num bytes Value *rmw_ptr = ptrs->get_value(idx); Value *rmw_msk = msks->get_value(idx); - Value *rmw_val = vals->get_value(idx); - //if(vector_size == 1) - // rmw_val = builder_->CreateExtractElement(rmw_val, builder_->getInt32(0)); + Value *rmw_val = packets[id]; + if(vector_size == 1) + rmw_val = builder_->CreateExtractElement(rmw_val, builder_->getInt32(0)); Type* ty = rmw_val->getType(); size_t nbits = ty->getScalarSizeInBits(); // extract pointer offset diff --git a/lib/driver/stream.cc b/lib/driver/stream.cc index fd581b085..8d6762767 100755 --- a/lib/driver/stream.cc +++ b/lib/driver/stream.cc @@ -82,6 +82,7 @@ void host_stream::synchronize() { void host_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event, void **args, size_t args_size) { auto hst = kernel->module()->hst(); + hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]); char* params = new char[args_size]; std::memcpy((void*)params, (void*)args, args_size); for(size_t i = 0; i < grid[0]; i++) diff --git a/python/setup.py b/python/setup.py index 69e3aa58a..9346e73fe 100644 --- a/python/setup.py +++ b/python/setup.py @@ -55,7 +55,7 @@ class CMakeBuild(build_ext): self.build_extension(ext) def build_extension(self, ext): - self.debug = True + #self.debug = True extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) # python directories python_include_dirs = distutils.sysconfig.get_python_inc() @@ -75,7 +75,7 @@ class CMakeBuild(build_ext): '-DLLVM_CONFIG=' + find_llvm()] # configuration cfg = 'Debug' if self.debug else 'Release' - cfg = 'Debug' + cfg = 'Release' build_args = ['--config', cfg] if platform.system() == "Windows":