1 #ifndef VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
2 #define VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
47 typedef std::vector<std::pair<const char *, viennacl::ocl::handle<cl_mem> > > temporaries_type;
49 static void fill_scalartypes(
statements_type statements, std::vector<const char *> & res){
50 res.reserve(statements.size());
51 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
54 switch(it->second.lhs.numeric_type){
56 res.push_back(
"float");
59 res.push_back(
"double");
80 if(temporaries_.empty()){
82 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
85 const char * scalartype_name;
87 switch(array[0].lhs.numeric_type){
90 default:
throw "not implemented";
92 for(scheduler::statement::container_type::const_iterator iit = array.begin() ; iit != array.end() ; ++iit){
94 temporaries_.push_back(std::make_pair(scalartype_name,
viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast<unsigned int>(num_groups_*size_of_scalartype))));
103 for(scheduler::statement::container_type::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
126 assert(
false &&
bool(
"unexpected expression tree"));
140 return "Vec,LSize,NumGroups,GlobalDecomposition";
144 std::ostringstream oss;
147 <<
"," << num_groups_
148 <<
"," << decomposition_;
179 set_size_argument(statements.front().first, statements.front().second, n_arg, k);
180 for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
181 k.
arg(n_arg++, it->second);
187 arguments_string += detail::generate_value_kernel_argument(
"unsigned int",
"N");
188 for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
189 arguments_string += detail::generate_pointer_kernel_argument(
"__global", it->first,
"temp" +
utils::to_string(std::distance(temporaries_.begin(), it)));
195 void core_0(
utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *>
const & scalartypes,
statements_type const & , std::vector<detail::mapping_type>
const & )
const {
197 stream <<
"unsigned int lid = get_local_id(0);" << std::endl;
199 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
200 stream << scalartypes[k] <<
" sum" << k <<
" = 0;" << std::endl;
203 stream <<
"for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0)){" << std::endl;
206 stream <<
"unsigned int chunk_size = (N + get_num_groups(0)-1)/get_num_groups(0);" << std::endl;
207 stream <<
"unsigned int chunk_start = get_group_id(0)*chunk_size;" << std::endl;
208 stream <<
"unsigned int chunk_end = min(chunk_start+chunk_size, N);" << std::endl;
209 stream <<
"for(unsigned int i = chunk_start + get_local_id(0) ; i < chunk_end ; i += get_local_size(0)){" << std::endl;
214 std::set<std::string> fetched;
216 for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
219 detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair(
"i",
"0"),
vector_size_,stream,(*it)->mapping());
220 detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair(
"i",
"0"),
vector_size_,stream,(*it)->mapping());
225 for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
231 detail::generate_all_lhs(statement,root_node,std::make_pair(
"i",
"0"),a,str,(*it)->mapping());
233 detail::generate_all_rhs(statement,root_node,std::make_pair(
"i",
"0"),a,str,(*it)->mapping());
234 stream <<
" sum" << std::distance(exprs.begin(),it) <<
" += " << str <<
";" << std::endl;
239 detail::generate_all_lhs(statement,root_node,std::make_pair(
"i",
"0"),-1,str,(*it)->mapping());
241 detail::generate_all_rhs(statement,root_node,std::make_pair(
"i",
"0"),-1,str,(*it)->mapping());
242 stream <<
" sum" << std::distance(exprs.begin(),it) <<
" += " << str <<
";" << std::endl;
248 stream <<
"}" << std::endl;
250 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
251 stream <<
"__local " << scalartypes[k] <<
" buf" << k <<
"[" <<
local_size_1_ <<
"];" << std::endl;
253 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
254 stream <<
"buf" << k <<
"[lid] = sum" << k <<
";" << std::endl;
258 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
259 stream <<
"if(lid < " <<
stride <<
"){" << std::endl;
261 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k){
262 stream <<
"buf" << k <<
"[lid] += buf" << k <<
"[lid + " <<
stride <<
"];" << std::endl;
265 stream <<
"}" << std::endl;
269 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
270 stream <<
"if(lid==0){" << std::endl;
272 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
273 stream <<
"buf" << k <<
"[0] += buf" << k <<
"[1];" << std::endl;
275 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
276 stream <<
"temp"<< k <<
"[get_group_id(0)] = buf" << k <<
"[0];" << std::endl;
279 stream <<
"}" << std::endl;
283 void core_1(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> scalartypes,
statements_type const & statements, std::vector<detail::mapping_type>
const & mapping)
const {
284 stream <<
"unsigned int lid = get_local_id(0);" << std::endl;
286 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
287 stream <<
"__local " << scalartypes[k] <<
" buf" << k <<
"[" <<
local_size_1_ <<
"];" << std::endl;
289 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
290 stream << scalartypes[0] <<
" sum" << k <<
" = 0;" << std::endl;
292 stream <<
"for(unsigned int i = lid ; i < " << num_groups_ <<
" ; i += get_local_size(0)){" << std::endl;
294 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
295 stream <<
"sum" << k <<
" += temp" << k <<
"[i];" << std::endl;
297 stream <<
"}" << std::endl;
299 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k)
300 stream <<
"buf" << k <<
"[lid] = sum" << k <<
";" << std::endl;
304 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
305 stream <<
"if(lid < " <<
stride <<
"){" << std::endl;
307 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k){
308 stream <<
"buf" << k <<
"[lid] += buf" << k <<
"[lid + " <<
stride <<
"];" << std::endl;
311 stream <<
"}" << std::endl;
314 stream <<
"barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
315 stream <<
"if(lid==0){" << std::endl;
317 for(
vcl_size_t k = 0 ; k < exprs.size() ; ++k){
318 stream <<
"buf" << k <<
"[0] += buf" << k <<
"[1];" << std::endl;
323 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
325 detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair(
"0",
"0"), -1, str, mapping[i++]),
false);
326 stream << str <<
";" << std::endl;
330 stream <<
"}" << std::endl;
333 void core(
vcl_size_t kernel_id, utils::kernel_generation_stream& stream,
statements_type const & statements, std::vector<detail::mapping_type>
const & mapping)
const {
334 std::vector<detail::mapped_scalar_reduction*> exprs;
335 for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
336 for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit)
337 if(detail::mapped_scalar_reduction * p = dynamic_cast<detail::mapped_scalar_reduction*>(iit->second.get()))
340 std::vector<const char *> scalartypes;
341 fill_scalartypes(statements, scalartypes);
344 core_0(stream,exprs,scalartypes,statements,mapping);
347 core_1(stream,exprs,scalartypes,statements,mapping);
352 unsigned int num_groups_;
353 unsigned int decomposition_;
354 mutable temporaries_type temporaries_;
A stream class where the kernel sources are streamed to. Takes care of indentation of the sources...
Definition: utils.hpp:233
void arg(unsigned int pos, cl_char val)
Sets a char argument at the provided position.
Definition: kernel.hpp:124
std::size_t vcl_size_t
Definition: forwards.h:58
vcl_size_t node_index
Definition: forwards.h:276
statement(container_type const &custom_array)
Definition: forwards.h:454
Internal utils for a dynamic OpenCL kernel generation.
void kernel_arguments(statements_type const &statements, std::string &arguments_string) const
Definition: scalar_reduction.hpp:185
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:59
Base class for an operation profile.
Definition: profile_base.hpp:47
lhs_rhs_element lhs
Definition: forwards.h:422
Definition: forwards.h:217
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:46
std::list< std::pair< scheduler::statement, scheduler::statement_node > > statements_type
Definition: profile_base.hpp:49
lhs_rhs_element rhs
Definition: forwards.h:424
std::string csv_representation() const
csv representation of an operation
Definition: scalar_reduction.hpp:143
Main namespace in ViennaCL. Holds all the basic types such as vector, matrix, etc. and defines operations upon them.
Definition: cpu_ram.hpp:29
several code generation helpers
Definition: forwards.h:170
Base classes for the profiles.
vcl_size_t lmem_used(vcl_size_t scalartype_size) const
Definition: scalar_reduction.hpp:75
static std::string csv_format()
Definition: scalar_reduction.hpp:139
void set_size_argument(viennacl::scheduler::statement const &s, viennacl::scheduler::statement_node const &, unsigned int &n_arg, viennacl::ocl::kernel &k) const
Definition: scalar_reduction.hpp:101
void init_temporaries(statements_type const &statements) const
Definition: scalar_reduction.hpp:79
void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const &statements, viennacl::ocl::kernel &k, unsigned int &n_arg) const
Configures the range and enqueues the arguments associated with the profile.
Definition: scalar_reduction.hpp:158
viennacl::ocl::context & current_context()
Convenience function for returning the current context.
Definition: backend.hpp:192
Definition: forwards.h:173
void configure_local_sizes(viennacl::ocl::kernel &k, vcl_size_t) const
Definition: profile_base.hpp:59
void dec_tab()
Definition: utils.hpp:259
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
std::vector< value_type > container_type
Definition: forwards.h:452
unsigned int vector_size_
Definition: profile_base.hpp:178
unsigned int vector_size() const
Get the vector size of the kernel.
Definition: profile_base.hpp:90
scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition)
The user constructor.
Definition: scalar_reduction.hpp:136
container_type const & array() const
Definition: forwards.h:473
unsigned int decomposition() const
Definition: scalar_reduction.hpp:155
void inc_tab()
Definition: utils.hpp:257
std::string to_string(T const t)
Definition: utils.hpp:204
Definition: forwards.h:101
statement_node_type_family type_family
Definition: forwards.h:269
The main class for representing a statement such as x = inner_prod(y,z); at runtime.
Definition: forwards.h:447
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:759
Implementations for the OpenCL backend functionality.
vcl_size_t local_size_1_
Definition: profile_base.hpp:179
Definition: forwards.h:216
Main datastructure for an node in the statement tree.
Definition: forwards.h:420
Functor for returning the internal size of a vector.
Definition: utils.hpp:167
unsigned int num_groups() const
Definition: scalar_reduction.hpp:152
OpenCL kernel generation template for scalar reduction operations such as s = norm_2(x).
Definition: scalar_reduction.hpp:45