intrin-wrapper.hpp
#ifndef _SCTL_INTRIN_WRAPPER_HPP_
#define _SCTL_INTRIN_WRAPPER_HPP_

#include <sctl/common.hpp>
#include SCTL_INCLUDE(math_utils.hpp)

#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

// TODO: Check alignment when SCTL_MEMDEBUG is defined
// TODO: Replace pointers with iterators

namespace SCTL_NAMESPACE { // Traits

enum class DataType {
  Integer,
  Real
};

template <class ValueType> class TypeTraits {
};

template <> class TypeTraits<int8_t> {
 public:
  static constexpr DataType Type = DataType::Integer;
  static constexpr Integer Size = sizeof(int8_t);
  static constexpr Integer SigBits = Size * 8 - 1;
};
template <> class TypeTraits<int16_t> {
 public:
  static constexpr DataType Type = DataType::Integer;
  static constexpr Integer Size = sizeof(int16_t);
  static constexpr Integer SigBits = Size * 8 - 1;
};
template <> class TypeTraits<int32_t> {
 public:
  static constexpr DataType Type = DataType::Integer;
  static constexpr Integer Size = sizeof(int32_t);
  static constexpr Integer SigBits = Size * 8 - 1;
};
template <> class TypeTraits<int64_t> {
 public:
  static constexpr DataType Type = DataType::Integer;
  static constexpr Integer Size = sizeof(int64_t);
  static constexpr Integer SigBits = Size * 8 - 1;
};
#ifdef __SIZEOF_INT128__
template <> class TypeTraits<__int128> {
 public:
  static constexpr DataType Type = DataType::Integer;
  static constexpr Integer Size = sizeof(__int128);
  static constexpr Integer SigBits = Size * 8 - 1;
};
#endif
template <> class TypeTraits<float> {
 public:
  static constexpr DataType Type = DataType::Real;
  static constexpr Integer Size = sizeof(float);
  static constexpr Integer SigBits = 23;
};
template <> class TypeTraits<double> {
 public:
  static constexpr DataType Type = DataType::Real;
  static constexpr Integer Size = sizeof(double);
  static constexpr Integer SigBits = 52;
};
template <> class TypeTraits<long double> {
 public:
  static constexpr DataType Type = DataType::Real;
  static constexpr Integer Size = sizeof(long double);
  static constexpr Integer SigBits = significant_bits<long double>();
};
#ifdef SCTL_QUAD_T
template <> class TypeTraits<QuadReal> {
 public:
  static constexpr DataType Type = DataType::Real;
  static constexpr Integer Size = sizeof(QuadReal);
  static constexpr Integer SigBits = 112;
};
#endif

template <Integer N> struct IntegerType {};
template <> struct IntegerType<sizeof( int8_t)> { using value = int8_t; };
template <> struct IntegerType<sizeof(int16_t)> { using value = int16_t; };
template <> struct IntegerType<sizeof(int32_t)> { using value = int32_t; };
template <> struct IntegerType<sizeof(int64_t)> { using value = int64_t; };
#ifdef __SIZEOF_INT128__
template <> struct IntegerType<sizeof(__int128)> { using value = __int128; };
#endif

}
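
// Illustrative note (not part of the library): TypeTraits reports the number
// of significant bits of a type, and IntegerType maps a byte count to the
// same-size signed integer, which the real<->integer conversion routines
// below rely on. For example:
//
//   static_assert(TypeTraits<double>::SigBits == 52, "explicit mantissa bits");
//   using Int = typename IntegerType<sizeof(double)>::value;  // int64_t
//   static_assert(sizeof(Int) == sizeof(double), "bit-compatible integer");
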
namespace SCTL_NAMESPACE { // Generic

template <class ValueType, Integer N> struct alignas(sizeof(ValueType)*N>SCTL_ALIGN_BYTES?SCTL_ALIGN_BYTES:sizeof(ValueType)*N) VecData {
  using ScalarType = ValueType;
  static constexpr Integer Size = N;
  ScalarType v[N];
};

template <class VData> inline VData zero_intrin() {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } a_;
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] = (typename VData::ScalarType)0;
  return a_.v;
}
template <class VData> inline VData set1_intrin(typename VData::ScalarType a) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } a_;
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] = a;
  return a_.v;
}
template <Integer k, Integer Size, class Data, class T> inline void SetHelper(Data& vec, T x) {
  vec.x[Size-k-1] = x;
}
template <Integer k, Integer Size, class Data, class T, class... T2> inline void SetHelper(Data& vec, T x, T2... rest) {
  vec.x[Size-k-1] = x;
  SetHelper<k-1,Size>(vec, rest...);
}
template <class VData, class T, class ...T2> inline VData set_intrin(T x, T2 ...args) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec;
  SetHelper<VData::Size-1,VData::Size>(vec, x, args...);
  return vec.v;
}
template <class VData> inline VData load1_intrin(typename VData::ScalarType const* p) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec;
  for (Integer i = 0; i < VData::Size; i++) vec.x[i] = p[0];
  return vec.v;
}
template <class VData> inline VData loadu_intrin(typename VData::ScalarType const* p) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec;
  for (Integer i = 0; i < VData::Size; i++) vec.x[i] = p[i];
  return vec.v;
}
template <class VData> inline VData load_intrin(typename VData::ScalarType const* p) {
  return loadu_intrin<VData>(p);
}
template <class VData> inline void storeu_intrin(typename VData::ScalarType* p, VData vec) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec_ = {vec};
  for (Integer i = 0; i < VData::Size; i++) p[i] = vec_.x[i];
}
template <class VData> inline void store_intrin(typename VData::ScalarType* p, VData vec) {
  storeu_intrin(p,vec);
}
template <class VData> inline void stream_store_intrin(typename VData::ScalarType* p, VData vec) {
  store_intrin(p,vec);
}
template <class VData> inline typename VData::ScalarType extract_intrin(VData vec, Integer i) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec_ = {vec};
  return vec_.x[i];
}
template <class VData> inline void insert_intrin(VData& vec, Integer i, typename VData::ScalarType value) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec_ = {vec};
  vec_.x[i] = value;
  vec = vec_.v;
}
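
// A minimal usage sketch of the generic layer (illustrative only; assumes a
// 4-wide double vector, but any ValueType/N pair works since the generic
// VecData is just an aligned array):
//
//   using VD = VecData<double,4>;
//   double buf[4] = {1, 2, 3, 4};
//   VD a = loadu_intrin<VD>(buf);        // a = {1,2,3,4}
//   insert_intrin(a, 0, 7.0);            // a = {7,2,3,4}
//   double last = extract_intrin(a, 3);  // last = 4
//   storeu_intrin(buf, a);               // buf = {7,2,3,4}
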
// Arithmetic operators
template <class VData> inline VData unary_minus_intrin(const VData& vec) {
  union {
    VData v;
    typename VData::ScalarType x[VData::Size];
  } vec_ = {vec};
  for (Integer i = 0; i < VData::Size; i++) vec_.x[i] = -vec_.x[i];
  return vec_.v;
}
template <class VData> inline VData mul_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] *= b_.x[i];
  return a_.v;
}
template <class VData> inline VData div_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] /= b_.x[i];
  return a_.v;
}
template <class VData> inline VData add_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] += b_.x[i];
  return a_.v;
}
template <class VData> inline VData sub_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] -= b_.x[i];
  return a_.v;
}
template <class VData> inline VData fma_intrin(const VData& a, const VData& b, const VData& c) {
  return add_intrin(mul_intrin(a,b), c);
}
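
// Note: this generic fma_intrin is an unfused fallback (a*b rounds, then +c
// rounds again); target-specific specializations can map it to a true fused
// multiply-add instruction where the hardware provides one, which rounds once.
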
// Bitwise operators
template <class VData> inline VData not_intrin(const VData& vec) {
  static constexpr Integer N = VData::Size*sizeof(typename VData::ScalarType);
  union {
    VData v;
    int8_t x[N];
  } vec_ = {vec};
  for (Integer i = 0; i < N; i++) vec_.x[i] = ~vec_.x[i];
  return vec_.v;
}
template <class VData> inline VData and_intrin(const VData& a, const VData& b) {
  static constexpr Integer N = VData::Size*sizeof(typename VData::ScalarType);
  union U {
    VData v;
    int8_t x[N];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] & b_.x[i];
  return a_.v;
}
template <class VData> inline VData xor_intrin(const VData& a, const VData& b) {
  static constexpr Integer N = VData::Size*sizeof(typename VData::ScalarType);
  union U {
    VData v;
    int8_t x[N];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] ^ b_.x[i];
  return a_.v;
}
template <class VData> inline VData or_intrin(const VData& a, const VData& b) {
  static constexpr Integer N = VData::Size*sizeof(typename VData::ScalarType);
  union U {
    VData v;
    int8_t x[N];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] | b_.x[i];
  return a_.v;
}
template <class VData> inline VData andnot_intrin(const VData& a, const VData& b) {
  static constexpr Integer N = VData::Size*sizeof(typename VData::ScalarType);
  union U {
    VData v;
    int8_t x[N];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] & (~b_.x[i]);
  return a_.v;
}

// Bitshift
template <class VData> inline VData bitshiftleft_intrin(const VData& a, const Integer& rhs) {
  static constexpr Integer N = VData::Size;
  union {
    VData v;
    typename VData::ScalarType x[N];
  } a_ = {a};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] << rhs;
  return a_.v;
}
template <class VData> inline VData bitshiftright_intrin(const VData& a, const Integer& rhs) {
  static constexpr Integer N = VData::Size;
  union {
    VData v;
    typename VData::ScalarType x[N];
  } a_ = {a};
  for (Integer i = 0; i < N; i++) a_.x[i] = a_.x[i] >> rhs;
  return a_.v;
}

// Other functions
template <class VData> inline VData max_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] = (a_.x[i] < b_.x[i] ? b_.x[i] : a_.x[i]);
  return a_.v;
}
template <class VData> inline VData min_intrin(const VData& a, const VData& b) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] = (a_.x[i] < b_.x[i] ? a_.x[i] : b_.x[i]);
  return a_.v;
}

// Conversion operators
template <class RetType, class ValueType, Integer N> inline RetType reinterpret_intrin(const VecData<ValueType,N>& v) {
  static_assert(sizeof(RetType) == sizeof(VecData<ValueType,N>), "Illegal type cast -- size of types does not match.");
  union {
    VecData<ValueType,N> v;
    RetType r;
  } u = {v};
  return u.r;
}
template <class RealVec, class IntVec> inline RealVec convert_int2real_intrin(const IntVec& x) {
  using Real = typename RealVec::ScalarType;
  using Int = typename IntVec::ScalarType;
  static_assert(TypeTraits<Real>::Type == DataType::Real, "Expected real type!");
  static_assert(TypeTraits<Int>::Type == DataType::Integer, "Expected integer type!");
  static_assert(sizeof(RealVec) == sizeof(IntVec) && sizeof(Real) == sizeof(Int), "Real and integer types must have same size!");
  static constexpr Integer SigBits = TypeTraits<Real>::SigBits;
  union {
    Int Cint = (((Int)1) << (SigBits - 1)) + ((SigBits + ((((Int)1)<<(sizeof(Real)*8 - SigBits - 2))-1)) << SigBits);
    Real Creal;
  };
  IntVec l(add_intrin(x, set1_intrin<IntVec>(Cint)));
  return sub_intrin(reinterpret_intrin<RealVec>(l), set1_intrin<RealVec>(Creal));
}
template <class IntVec, class RealVec> inline IntVec round_real2int_intrin(const RealVec& x) {
  using Int = typename IntVec::ScalarType;
  using Real = typename RealVec::ScalarType;
  static_assert(TypeTraits<Real>::Type == DataType::Real, "Expected real type!");
  static_assert(TypeTraits<Int>::Type == DataType::Integer, "Expected integer type!");
  static_assert(sizeof(RealVec) == sizeof(IntVec) && sizeof(Real) == sizeof(Int), "Real and integer types must have same size!");
  static constexpr Integer SigBits = TypeTraits<Real>::SigBits;
  union {
    Int Cint = (((Int)1) << (SigBits - 1)) + ((SigBits + ((((Int)1)<<(sizeof(Real)*8 - SigBits - 2))-1)) << SigBits);
    Real Creal;
  };
  RealVec d(add_intrin(x, set1_intrin<RealVec>(Creal)));
  return sub_intrin(reinterpret_intrin<IntVec>(d), set1_intrin<IntVec>(Cint));
}
template <class VData> inline VData round_real2real_intrin(const VData& x) {
  using Real = typename VData::ScalarType;
  using Int = typename IntegerType<sizeof(Real)>::value;
  static_assert(TypeTraits<Real>::Type == DataType::Real, "Expected real type!");
  static constexpr Integer SigBits = TypeTraits<Real>::SigBits;
  union {
    Int Cint = (((Int)1) << (SigBits - 1)) + ((SigBits + ((((Int)1)<<(sizeof(Real)*8 - SigBits - 2))-1)) << SigBits);
    Real Creal;
  };
  VData Vreal(set1_intrin<VData>(Creal));
  return sub_intrin(add_intrin(x, Vreal), Vreal);
}
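
// How the magic constants above work (a sketch, using double as the example
// type): with SigBits = 52, Cint is the bit pattern of
// Creal = 2^52 + 2^51 = 1.5 * 2^52. For |x| < 2^51 the sum (x + Creal) has a
// fixed exponent, so the addition itself rounds x into the low mantissa bits;
// subtracting Creal (or Cint, when reinterpreted as an integer) recovers
// round(x). Scalar equivalent (assumes the default round-to-nearest mode):
//
//   double x = 3.7;
//   volatile double t = x + 6755399441055744.0;  // 1.5 * 2^52
//   double r = t - 6755399441055744.0;           // r == 4.0
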
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

template <class VData> struct Mask : public VData {
  using VDataType = VData;
  static inline Mask Zero() {
    return Mask<VData>(zero_intrin<VDataType>());
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const VData& v) : VData(v) {}
};
template <class RetType, class VData> inline RetType reinterpret_mask(const Mask<VData>& v) {
  static_assert(sizeof(RetType) == sizeof(Mask<VData>), "Illegal type cast -- size of types does not match.");
  union {
    Mask<VData> v;
    RetType r;
  } u = {v};
  return u.r;
}

// Bitwise operators
template <class VData> inline Mask<VData> operator~(const Mask<VData>& vec) {
  return Mask<VData>(not_intrin(vec));
}
template <class VData> inline Mask<VData> operator&(const Mask<VData>& a, const Mask<VData>& b) {
  return Mask<VData>(and_intrin(a,b));
}
template <class VData> inline Mask<VData> operator^(const Mask<VData>& a, const Mask<VData>& b) {
  return Mask<VData>(xor_intrin(a,b));
}
template <class VData> inline Mask<VData> operator|(const Mask<VData>& a, const Mask<VData>& b) {
  return Mask<VData>(or_intrin(a,b));
}
template <class VData> inline Mask<VData> AndNot(const Mask<VData>& a, const Mask<VData>& b) {
  return Mask<VData>(andnot_intrin(a,b));
}
template <class VData> inline VData convert_mask2vec_intrin(const Mask<VData>& v) {
  return v;
}
template <class VData> inline Mask<VData> convert_vec2mask_intrin(const VData& v) {
  return Mask<VData>(v);
}

/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

// Comparison operators
enum class ComparisonType { lt, le, gt, ge, eq, ne };
template <ComparisonType TYPE, class VData> inline Mask<VData> comp_intrin(const VData& a, const VData& b) {
  static_assert(sizeof(Mask<VData>) == sizeof(VData), "Invalid operation on Mask");
  using ScalarType = typename VData::ScalarType;
  using IntType = typename IntegerType<sizeof(ScalarType)>::value;
  union U {
    VData v;
    Mask<VData> m;
    ScalarType x[VData::Size];
    IntType q[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  U c_;
  static constexpr IntType zero_const = (IntType)0;
  static constexpr IntType one_const = ~(IntType)0;
  if (TYPE == ComparisonType::lt) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] <  b_.x[i] ? one_const : zero_const);
  if (TYPE == ComparisonType::le) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] <= b_.x[i] ? one_const : zero_const);
  if (TYPE == ComparisonType::gt) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] >  b_.x[i] ? one_const : zero_const);
  if (TYPE == ComparisonType::ge) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] >= b_.x[i] ? one_const : zero_const);
  if (TYPE == ComparisonType::eq) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] == b_.x[i] ? one_const : zero_const);
  if (TYPE == ComparisonType::ne) for (Integer i = 0; i < VData::Size; i++) c_.q[i] = (a_.x[i] != b_.x[i] ? one_const : zero_const);
  return c_.m;
}
template <class VData> inline VData select_intrin(const Mask<VData>& s, const VData& a, const VData& b) {
  static_assert(sizeof(Mask<VData>) == sizeof(VData), "Invalid operation on Mask");
  union U {
    Mask<VData> m;
    VData v;
  } s_ = {s};
  return or_intrin(and_intrin(a,s_.v), andnot_intrin(b,s_.v));
}
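
// Usage sketch (illustrative): comp_intrin yields an all-ones/all-zeros
// integer mask per lane, so combining it with select_intrin gives branchless
// lane-wise selection, e.g. a per-lane maximum:
//
//   template <class VData> VData max_via_select(const VData& a, const VData& b) {
//     return select_intrin(comp_intrin<ComparisonType::gt>(a, b), a, b);
//   }
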
// Special functions
template <Integer MAX_ITER, Integer ITER, class VData> struct rsqrt_newton_iter {
  static inline VData eval(const VData& y, const VData& x) {
    using ValueType = typename VData::ScalarType;
    constexpr ValueType c1 = -3 * pow<pow<MAX_ITER-ITER>(3)-1,ValueType>(2);
    return rsqrt_newton_iter<MAX_ITER,ITER-1,VData>::eval(mul_intrin(y, fma_intrin(x,mul_intrin(y,y),set1_intrin<VData>(c1))), x);
  }
};
template <Integer MAX_ITER, class VData> struct rsqrt_newton_iter<MAX_ITER,1,VData> {
  static inline VData eval(const VData& y, const VData& x) {
    using ValueType = typename VData::ScalarType;
    constexpr ValueType c1 = -3 * pow<pow<MAX_ITER-1>(3)-1,ValueType>(2);
    constexpr ValueType c2 = pow<(pow<MAX_ITER-1>(3)-1)*3/2+1,ValueType>(-0.5);
    return mul_intrin(mul_intrin(y, fma_intrin(x,mul_intrin(y,y),set1_intrin<VData>(c1))), set1_intrin<VData>(c2));
  }
};
template <class VData> struct rsqrt_newton_iter<0,0,VData> {
  static inline VData eval(const VData& y, const VData& x) {
    return y;
  }
};
static inline constexpr Integer mylog2(Integer x) {
  return ((x<1) ? 0 : 1+mylog2(x/2));
}
template <Integer digits, class VData> struct rsqrt_approx_intrin {
  static inline VData eval(const VData& a) {
    union {
      VData v;
      typename VData::ScalarType x[VData::Size];
    } a_ = {a}, b_;
    for (Integer i = 0; i < VData::Size; i++) b_.x[i] = (typename VData::ScalarType)1/sqrt<float>((float)a_.x[i]);
    constexpr Integer newton_iter = mylog2((Integer)(digits/7.2247198959));
    return rsqrt_newton_iter<newton_iter,newton_iter,VData>::eval(b_.v, a);
  }
  static inline VData eval(const VData& a, const Mask<VData>& m) {
    return and_intrin(rsqrt_approx_intrin<digits,VData>::eval(a), convert_mask2vec_intrin(m));
  }
};
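
// Background for the recursion above (a sketch): the classical Newton step
// for y ~ 1/sqrt(x) is y <- y*(3 - x*y*y)/2, which doubles the number of
// correct digits per step. Here the division by 2 is deferred: each
// unnormalized step cubes the accumulated power-of-two scale factor, which
// is where the 3^k exponents in c1 and the single final correction c2 come
// from. The float-precision seed 1/sqrt((float)a) supplies roughly 7.2
// decimal digits, hence newton_iter = mylog2(digits / 7.22...).
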
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0) {
  return set1_intrin<VData>(c0);
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1) {
  return fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2) {
  VData x2(mul_intrin<VData>(x1,x1));
  return fma_intrin(x2, set1_intrin<VData>(c2), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3) {
  VData x2(mul_intrin<VData>(x1,x1));
  return fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  return fma_intrin(x4, set1_intrin<VData>(c4), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  return fma_intrin(x4, fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4)), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  return fma_intrin(x4, fma_intrin(x2, set1_intrin<VData>(c6), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  return fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, set1_intrin<VData>(c8), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8)), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x2, set1_intrin<VData>(c10), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10, const CType& c11) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c11), set1_intrin<VData>(c10)), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10, const CType& c11, const CType& c12) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x4, set1_intrin<VData>(c12), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c11), set1_intrin<VData>(c10)), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8)))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10, const CType& c11, const CType& c12, const CType& c13) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x4, fma_intrin(x1, set1_intrin<VData>(c13), set1_intrin<VData>(c12)), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c11), set1_intrin<VData>(c10)), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8)))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10, const CType& c11, const CType& c12, const CType& c13, const CType& c14) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x4, fma_intrin(x2, set1_intrin<VData>(c14), fma_intrin(x1, set1_intrin<VData>(c13), set1_intrin<VData>(c12))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c11), set1_intrin<VData>(c10)), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8)))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
template <class VData, class CType> inline VData EvalPolynomial(const VData& x1, const CType& c0, const CType& c1, const CType& c2, const CType& c3, const CType& c4, const CType& c5, const CType& c6, const CType& c7, const CType& c8, const CType& c9, const CType& c10, const CType& c11, const CType& c12, const CType& c13, const CType& c14, const CType& c15) {
  VData x2(mul_intrin<VData>(x1,x1));
  VData x4(mul_intrin<VData>(x2,x2));
  VData x8(mul_intrin<VData>(x4,x4));
  return fma_intrin(x8, fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c15), set1_intrin<VData>(c14)), fma_intrin(x1, set1_intrin<VData>(c13), set1_intrin<VData>(c12))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c11), set1_intrin<VData>(c10)), fma_intrin(x1, set1_intrin<VData>(c9), set1_intrin<VData>(c8)))), fma_intrin(x4, fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c7), set1_intrin<VData>(c6)), fma_intrin(x1, set1_intrin<VData>(c5), set1_intrin<VData>(c4))), fma_intrin(x2, fma_intrin(x1, set1_intrin<VData>(c3), set1_intrin<VData>(c2)), fma_intrin(x1, set1_intrin<VData>(c1), set1_intrin<VData>(c0)))));
}
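
// The overloads above evaluate polynomials by Estrin's scheme: the powers
// x2 = x^2, x4 = x^4, x8 = x^8 are formed once, and coefficient pairs are
// combined with fma_intrin, giving a shorter dependency chain than Horner's
// rule. For example, the degree-3 overload computes
//   (c3*x1 + c2)*x2 + (c1*x1 + c0),
// which equals c0 + c1*x + c2*x^2 + c3*x^3.
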
template <Integer ORDER, class VData> inline void approx_sincos_intrin(VData& sinx, VData& cosx, const VData& x) {
  // ORDER    ERROR
  //     1    8.81e-02
  //     3    2.45e-03
  //     5    3.63e-05
  //     7    3.11e-07
  //     9    1.75e-09
  //    11    6.93e-12
  //    13    2.09e-14
  //    15    6.66e-16
  //    17    6.66e-16
  using Real = typename VData::ScalarType;
  using Int = typename IntegerType<sizeof(Real)>::value;
  using IntVec = VecData<Int, VData::Size>;
  static_assert(TypeTraits<Real>::Type == DataType::Real, "Expected real type!");
  static constexpr Integer SigBits = TypeTraits<Real>::SigBits;
  static constexpr Real coeff1  =  1;
  static constexpr Real coeff3  = -1/(((Real)2)*3);
  static constexpr Real coeff5  =  1/(((Real)2)*3*4*5);
  static constexpr Real coeff7  = -1/(((Real)2)*3*4*5*6*7);
  static constexpr Real coeff9  =  1/(((Real)2)*3*4*5*6*7*8*9);
  static constexpr Real coeff11 = -1/(((Real)2)*3*4*5*6*7*8*9*10*11);
  static constexpr Real coeff13 =  1/(((Real)2)*3*4*5*6*7*8*9*10*11*12*13);
  static constexpr Real coeff15 = -1/(((Real)2)*3*4*5*6*7*8*9*10*11*12*13*14*15);
  static constexpr Real coeff17 =  1/(((Real)2)*3*4*5*6*7*8*9*10*11*12*13*14*15*16*17);
  static constexpr Real coeff19 = -1/(((Real)2)*3*4*5*6*7*8*9*10*11*12*13*14*15*16*17*18*19); // err = 2^-72.7991
  static constexpr Real pi_over_2 = const_pi<Real>()/2;
  static constexpr Real neg_pi_over_2 = -const_pi<Real>()/2;
  static constexpr Real inv_pi_over_2 = 1 / pi_over_2;
  union {
    Int Cint = 0 + (((Int)1) << (SigBits - 1)) + ((SigBits + ((((Int)1)<<(sizeof(Real)*8 - SigBits - 2))-1)) << SigBits);
    Real Creal;
  };
  VData real_offset(set1_intrin<VData>(Creal));
  VData x_int(fma_intrin(x, set1_intrin<VData>(inv_pi_over_2), real_offset));
  VData x_(sub_intrin(x_int, real_offset)); // x_ <-- round(x*inv_pi_over_2)
  VData x1 = fma_intrin(x_, set1_intrin<VData>(neg_pi_over_2), x);

  VData s1;
  VData x2 = mul_intrin(x1,x1);
  if      (ORDER >= 19) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9, coeff11, coeff13, coeff15, coeff17, coeff19), x1);
  else if (ORDER >= 17) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9, coeff11, coeff13, coeff15, coeff17), x1);
  else if (ORDER >= 15) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9, coeff11, coeff13, coeff15), x1);
  else if (ORDER >= 13) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9, coeff11, coeff13), x1);
  else if (ORDER >= 11) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9, coeff11), x1);
  else if (ORDER >=  9) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7, coeff9), x1);
  else if (ORDER >=  7) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5, coeff7), x1);
  else if (ORDER >=  5) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3, coeff5), x1);
  else if (ORDER >=  3) s1 = mul_intrin(EvalPolynomial(x2, coeff1, coeff3), x1);
  else                  s1 = mul_intrin(EvalPolynomial(x2, coeff1), x1);

  VData cos_squared = sub_intrin(set1_intrin<VData>(1), mul_intrin(s1, s1));
  VData inv_cos = rsqrt_approx_intrin<ORDER,VData>::eval(cos_squared, comp_intrin<ComparisonType::ne>(cos_squared,zero_intrin<VData>()));
  VData c1 = mul_intrin(cos_squared, inv_cos);

  IntVec vec_zero(zero_intrin<IntVec>());
  auto xAnd1 = reinterpret_mask<Mask<VData>>(comp_intrin<ComparisonType::eq>(and_intrin(reinterpret_intrin<IntVec>(x_int), set1_intrin<IntVec>(1)), vec_zero));
  auto xAnd2 = reinterpret_mask<Mask<VData>>(comp_intrin<ComparisonType::eq>(and_intrin(reinterpret_intrin<IntVec>(x_int), set1_intrin<IntVec>(2)), vec_zero));
  VData s2(select_intrin(xAnd1, s1, c1));
  VData c2(select_intrin(xAnd1, c1, unary_minus_intrin(s1)));
  sinx = select_intrin(xAnd2, s2, unary_minus_intrin(s2));
  cosx = select_intrin(xAnd2, c2, unary_minus_intrin(c2));
}
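
// Sketch of the algorithm above: x is reduced by multiples of pi/2 (via the
// same magic-constant rounding trick as round_real2real_intrin), sin of the
// reduced argument is evaluated from the odd Taylor series, cos is recovered
// as sqrt(1 - sin^2) through rsqrt_approx_intrin, and the two low bits of the
// quadrant index select the sin/cos swap and the signs. Illustrative call:
//
//   using VD = VecData<double,4>;
//   VD s, c, x = set1_intrin<VD>(1.0);
//   approx_sincos_intrin<13>(s, c, x);  // max error ~2.09e-14 per the table
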
template <class VData> inline void sincos_intrin(VData& sinx, VData& cosx, const VData& x) {
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U sinx_, cosx_, x_ = {x};
  for (Integer i = 0; i < VData::Size; i++) {
    sinx_.x[i] = sin(x_.x[i]);
    cosx_.x[i] = cos(x_.x[i]);
  }
  sinx = sinx_.v;
  cosx = cosx_.v;
}

template <Integer ORDER, class VData> inline VData approx_exp_intrin(const VData& x) {
  using Real = typename VData::ScalarType;
  using Int = typename IntegerType<sizeof(Real)>::value;
  using IntVec = VecData<Int, VData::Size>;
  static_assert(TypeTraits<Real>::Type == DataType::Real, "Expected real type!");
  static constexpr Int SigBits = TypeTraits<Real>::SigBits;
  static constexpr Real coeff2  = 1/(((Real)2));
  static constexpr Real coeff3  = 1/(((Real)2)*3);
  static constexpr Real coeff4  = 1/(((Real)2)*3*4);
  static constexpr Real coeff5  = 1/(((Real)2)*3*4*5);
  static constexpr Real coeff6  = 1/(((Real)2)*3*4*5*6);
  static constexpr Real coeff7  = 1/(((Real)2)*3*4*5*6*7);
  static constexpr Real coeff8  = 1/(((Real)2)*3*4*5*6*7*8);
  static constexpr Real coeff9  = 1/(((Real)2)*3*4*5*6*7*8*9);
  static constexpr Real coeff10 = 1/(((Real)2)*3*4*5*6*7*8*9*10);
  static constexpr Real coeff11 = 1/(((Real)2)*3*4*5*6*7*8*9*10*11);
  static constexpr Real coeff12 = 1/(((Real)2)*3*4*5*6*7*8*9*10*11*12);
  static constexpr Real coeff13 = 1/(((Real)2)*3*4*5*6*7*8*9*10*11*12*13); // err = 2^-57.2759
  static constexpr Real x0 = -(Real)0.693147180559945309417232121458l; // -ln(2)
  static constexpr Real invx0 = -1 / x0; // 1/ln(2)

  VData x_(round_real2real_intrin(mul_intrin(x, set1_intrin<VData>(invx0))));
  IntVec int_x_ = round_real2int_intrin<IntVec>(x_);
  VData x1 = fma_intrin(x_, set1_intrin<VData>(x0), x);

  VData e1;
  if      (ORDER >= 13) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8, coeff9, coeff10, coeff11, coeff12, coeff13);
  else if (ORDER >= 12) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8, coeff9, coeff10, coeff11, coeff12);
  else if (ORDER >= 11) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8, coeff9, coeff10, coeff11);
  else if (ORDER >= 10) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8, coeff9, coeff10);
  else if (ORDER >=  9) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8, coeff9);
  else if (ORDER >=  8) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8);
  else if (ORDER >=  7) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7);
  else if (ORDER >=  6) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5, coeff6);
  else if (ORDER >=  5) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4, coeff5);
  else if (ORDER >=  4) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3, coeff4);
  else if (ORDER >=  3) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2, coeff3);
  else if (ORDER >=  2) e1 = EvalPolynomial(x1, (Real)1, (Real)1, coeff2);
  else if (ORDER >=  1) e1 = EvalPolynomial(x1, (Real)1, (Real)1);
  else if (ORDER >=  0) e1 = set1_intrin<VData>(1);

  VData e2;
  { // set e2 = 2 ^ x_
    union {
      Real real_one = 1.0;
      Int int_one;
    };
    IntVec int_e2 = add_intrin(set1_intrin<IntVec>(int_one), bitshiftleft_intrin(int_x_, SigBits));
    // Handle underflow
    static constexpr Int max_exp = -(Int)(((Int)1)<<((sizeof(Real)*8-SigBits-2)));
    e2 = reinterpret_intrin<VData>(select_intrin(comp_intrin<ComparisonType::gt>(int_x_, set1_intrin<IntVec>(max_exp)), int_e2, zero_intrin<IntVec>()));
  }
  return mul_intrin(e1, e2);
}
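
// Sketch of the strategy above: write x = x1 + k*ln(2) with integer
// k = round(x/ln(2)), evaluate e1 = exp(x1) as a Taylor polynomial on the
// small remainder x1, and build e2 = 2^k directly in the exponent field by
// adding (k << SigBits) to the bit pattern of 1.0; the final select zeroes
// the result when k underflows the representable exponent range.
// Illustrative call:
//
//   using VD = VecData<double,4>;
//   VD y = approx_exp_intrin<13>(set1_intrin<VD>(1.0));  // ~e, err ~ 2^-57
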
  687. template <class VData> inline VData exp_intrin(const VData& x) {
  688. union U {
  689. VData v;
  690. typename VData::ScalarType x[VData::Size];
  691. };
  692. U expx_, x_ = {x};
  693. for (Integer i = 0; i < VData::Size; i++) expx_.x[i] = exp(x_.x[i]);
  694. return expx_.v;
  695. }
  696. template <Integer ORDER, class VData> inline VData approx_sin_intrin(const VData& x) {
  697. VData sinx, cosx;
  698. approx_sincos_intrin<ORDER>(sinx, cosx, x);
  699. return sinx;
  700. }
  701. template <Integer ORDER, class VData> inline VData approx_cos_intrin(const VData& x) {
  702. VData sinx, cosx;
  703. approx_sincos_intrin<ORDER>(sinx, cosx, x);
  704. return cosx;
  705. }
  706. template <Integer ORDER, class VData> inline VData approx_tan_intrin(const VData& x) {
  707. //constexpr Integer digits = ORDER;
  708. VData sinx, cosx;
  709. approx_sincos_intrin<ORDER>(sinx, cosx, x);
  710. return div_intrin(sinx, cosx);
  711. //VData cos2_x = mul_intrin(cosx, cosx);
  712. //VData cos4_x = mul_intrin(cos2_x, cos2_x);
  713. //VData sec2_x = rsqrt_approx_intrin<digits,VData>::eval(cos4_x);
  714. //return mul_intrin(sinx, mul_intrin(cosx, sec2_x));
  715. }
  716. }
  717. namespace SCTL_NAMESPACE { // SSE
  718. #ifdef __SSE4_2__
  719. template <> struct alignas(sizeof(int8_t) * 16) VecData<int8_t,16> {
  720. using ScalarType = int8_t;
  721. static constexpr Integer Size = 16;
  722. VecData() = default;
  723. inline VecData(__m128i v_) : v(v_) {}
  724. __m128i v;
  725. };
  726. template <> struct alignas(sizeof(int16_t) * 8) VecData<int16_t,8> {
  727. using ScalarType = int16_t;
  728. static constexpr Integer Size = 8;
  729. VecData() = default;
  730. inline VecData(__m128i v_) : v(v_) {}
  731. __m128i v;
  732. };
  733. template <> struct alignas(sizeof(int32_t) * 4) VecData<int32_t,4> {
  734. using ScalarType = int32_t;
  735. static constexpr Integer Size = 4;
  736. VecData() = default;
  737. inline VecData(__m128i v_) : v(v_) {}
  738. __m128i v;
  739. };
  740. template <> struct alignas(sizeof(int64_t) * 2) VecData<int64_t,2> {
  741. using ScalarType = int64_t;
  742. static constexpr Integer Size = 2;
  743. VecData() = default;
  744. inline VecData(__m128i v_) : v(v_) {}
  745. __m128i v;
  746. };
  747. template <> struct alignas(sizeof(float) * 4) VecData<float,4> {
  748. using ScalarType = float;
  749. static constexpr Integer Size = 4;
  750. VecData() = default;
  751. inline VecData(__m128 v_) : v(v_) {}
  752. __m128 v;
  753. };
  754. template <> struct alignas(sizeof(double) * 2) VecData<double,2> {
  755. using ScalarType = double;
  756. static constexpr Integer Size = 2;
  757. VecData() = default;
  758. inline VecData(__m128d v_) : v(v_) {}
  759. __m128d v;
  760. };
  761. // Select between two sources, byte by byte. Used in various functions and operators
  762. // Corresponds to this pseudocode:
  763. // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
  764. // Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
  765. // The implementation depends on the instruction set:
  766. // If SSE4.1 is supported then only bit 7 in each byte of s is checked,
  767. // otherwise all bits in s are used.
  768. static inline __m128i selectb (__m128i const & s, __m128i const & a, __m128i const & b) {
  769. #if defined(__SSE4_1__)
  770. return _mm_blendv_epi8 (b, a, s);
  771. #else
  772. return _mm_or_si128(_mm_and_si128(s,a), _mm_andnot_si128(s,b));
  773. #endif
  774. }
template <> inline VecData<int8_t,16> zero_intrin<VecData<int8_t,16>>() {
  return _mm_setzero_si128();
}
template <> inline VecData<int16_t,8> zero_intrin<VecData<int16_t,8>>() {
  return _mm_setzero_si128();
}
template <> inline VecData<int32_t,4> zero_intrin<VecData<int32_t,4>>() {
  return _mm_setzero_si128();
}
template <> inline VecData<int64_t,2> zero_intrin<VecData<int64_t,2>>() {
  return _mm_setzero_si128();
}
template <> inline VecData<float,4> zero_intrin<VecData<float,4>>() {
  return _mm_setzero_ps();
}
template <> inline VecData<double,2> zero_intrin<VecData<double,2>>() {
  return _mm_setzero_pd();
}
template <> inline VecData<int8_t,16> set1_intrin<VecData<int8_t,16>>(int8_t a) {
  return _mm_set1_epi8(a);
}
template <> inline VecData<int16_t,8> set1_intrin<VecData<int16_t,8>>(int16_t a) {
  return _mm_set1_epi16(a);
}
template <> inline VecData<int32_t,4> set1_intrin<VecData<int32_t,4>>(int32_t a) {
  return _mm_set1_epi32(a);
}
template <> inline VecData<int64_t,2> set1_intrin<VecData<int64_t,2>>(int64_t a) {
  return _mm_set1_epi64x(a);
}
template <> inline VecData<float,4> set1_intrin<VecData<float,4>>(float a) {
  return _mm_set1_ps(a);
}
template <> inline VecData<double,2> set1_intrin<VecData<double,2>>(double a) {
  return _mm_set1_pd(a);
}
template <> inline VecData<int8_t,16> set_intrin<VecData<int8_t,16>,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t>(int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16) {
  return _mm_set_epi8(v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int16_t,8> set_intrin<VecData<int16_t,8>,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t>(int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7, int16_t v8) {
  return _mm_set_epi16(v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int32_t,4> set_intrin<VecData<int32_t,4>,int32_t,int32_t,int32_t,int32_t>(int32_t v1, int32_t v2, int32_t v3, int32_t v4) {
  return _mm_set_epi32(v4,v3,v2,v1);
}
template <> inline VecData<int64_t,2> set_intrin<VecData<int64_t,2>,int64_t,int64_t>(int64_t v1, int64_t v2) {
  return _mm_set_epi64x(v2,v1);
}
template <> inline VecData<float,4> set_intrin<VecData<float,4>,float,float,float,float>(float v1, float v2, float v3, float v4) {
  return _mm_set_ps(v4,v3,v2,v1);
}
template <> inline VecData<double,2> set_intrin<VecData<double,2>,double,double>(double v1, double v2) {
  return _mm_set_pd(v2,v1);
}
template <> inline VecData<int8_t,16> load1_intrin<VecData<int8_t,16>>(int8_t const* p) {
  return _mm_set1_epi8(p[0]);
}
template <> inline VecData<int16_t,8> load1_intrin<VecData<int16_t,8>>(int16_t const* p) {
  return _mm_set1_epi16(p[0]);
}
template <> inline VecData<int32_t,4> load1_intrin<VecData<int32_t,4>>(int32_t const* p) {
  return _mm_set1_epi32(p[0]);
}
template <> inline VecData<int64_t,2> load1_intrin<VecData<int64_t,2>>(int64_t const* p) {
  return _mm_set1_epi64x(p[0]);
}
template <> inline VecData<float,4> load1_intrin<VecData<float,4>>(float const* p) {
  return _mm_load1_ps(p);
}
template <> inline VecData<double,2> load1_intrin<VecData<double,2>>(double const* p) {
  return _mm_load1_pd(p);
}
template <> inline VecData<int8_t,16> loadu_intrin<VecData<int8_t,16>>(int8_t const* p) {
  return _mm_loadu_si128((__m128i const*)p);
}
template <> inline VecData<int16_t,8> loadu_intrin<VecData<int16_t,8>>(int16_t const* p) {
  return _mm_loadu_si128((__m128i const*)p);
}
template <> inline VecData<int32_t,4> loadu_intrin<VecData<int32_t,4>>(int32_t const* p) {
  return _mm_loadu_si128((__m128i const*)p);
}
template <> inline VecData<int64_t,2> loadu_intrin<VecData<int64_t,2>>(int64_t const* p) {
  return _mm_loadu_si128((__m128i const*)p);
}
template <> inline VecData<float,4> loadu_intrin<VecData<float,4>>(float const* p) {
  return _mm_loadu_ps(p);
}
template <> inline VecData<double,2> loadu_intrin<VecData<double,2>>(double const* p) {
  return _mm_loadu_pd(p);
}
template <> inline VecData<int8_t,16> load_intrin<VecData<int8_t,16>>(int8_t const* p) {
  return _mm_load_si128((__m128i const*)p);
}
template <> inline VecData<int16_t,8> load_intrin<VecData<int16_t,8>>(int16_t const* p) {
  return _mm_load_si128((__m128i const*)p);
}
template <> inline VecData<int32_t,4> load_intrin<VecData<int32_t,4>>(int32_t const* p) {
  return _mm_load_si128((__m128i const*)p);
}
template <> inline VecData<int64_t,2> load_intrin<VecData<int64_t,2>>(int64_t const* p) {
  return _mm_load_si128((__m128i const*)p);
}
template <> inline VecData<float,4> load_intrin<VecData<float,4>>(float const* p) {
  return _mm_load_ps(p);
}
template <> inline VecData<double,2> load_intrin<VecData<double,2>>(double const* p) {
  return _mm_load_pd(p);
}
template <> inline void storeu_intrin<VecData<int8_t,16>>(int8_t* p, VecData<int8_t,16> vec) {
  _mm_storeu_si128((__m128i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int16_t,8>>(int16_t* p, VecData<int16_t,8> vec) {
  _mm_storeu_si128((__m128i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int32_t,4>>(int32_t* p, VecData<int32_t,4> vec) {
  _mm_storeu_si128((__m128i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int64_t,2>>(int64_t* p, VecData<int64_t,2> vec) {
  _mm_storeu_si128((__m128i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<float,4>>(float* p, VecData<float,4> vec) {
  _mm_storeu_ps(p, vec.v);
}
template <> inline void storeu_intrin<VecData<double,2>>(double* p, VecData<double,2> vec) {
  _mm_storeu_pd(p, vec.v);
}
template <> inline void store_intrin<VecData<int8_t,16>>(int8_t* p, VecData<int8_t,16> vec) {
  _mm_store_si128((__m128i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int16_t,8>>(int16_t* p, VecData<int16_t,8> vec) {
  _mm_store_si128((__m128i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int32_t,4>>(int32_t* p, VecData<int32_t,4> vec) {
  _mm_store_si128((__m128i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int64_t,2>>(int64_t* p, VecData<int64_t,2> vec) {
  _mm_store_si128((__m128i*)p, vec.v);
}
template <> inline void store_intrin<VecData<float,4>>(float* p, VecData<float,4> vec) {
  _mm_store_ps(p, vec.v);
}
template <> inline void store_intrin<VecData<double,2>>(double* p, VecData<double,2> vec) {
  _mm_store_pd(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int8_t,16>>(int8_t* p, VecData<int8_t,16> vec) {
  _mm_stream_si128((__m128i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int16_t,8>>(int16_t* p, VecData<int16_t,8> vec) {
  _mm_stream_si128((__m128i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int32_t,4>>(int32_t* p, VecData<int32_t,4> vec) {
  _mm_stream_si128((__m128i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int64_t,2>>(int64_t* p, VecData<int64_t,2> vec) {
  _mm_stream_si128((__m128i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<float,4>>(float* p, VecData<float,4> vec) {
  _mm_stream_ps(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<double,2>>(double* p, VecData<double,2> vec) {
  _mm_stream_pd(p, vec.v);
}
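// Note on the AVX-512 extract implementations below: _mm_maskz_compress_*
// packs the mask-selected elements toward lane 0 and zeroes the rest, so a
// mask of (1u << i) moves element i into lane 0, where it is read back with
// _mm_cvtsi128_si32 / _mm_cvtss_f32. The double variant uses a masked
// unpackhi instead: mask 0 leaves element 0 in lane 0, mask 1 moves
// element 1 into lane 0.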
#if defined(__AVX512VBMI2__)
template <> inline int8_t extract_intrin<VecData<int8_t,16>>(VecData<int8_t,16> vec, Integer i) {
  __m128i x = _mm_maskz_compress_epi8(__mmask16(1u<<i), vec.v);
  return (int8_t)_mm_cvtsi128_si32(x);
}
template <> inline int16_t extract_intrin<VecData<int16_t,8>>(VecData<int16_t,8> vec, Integer i) {
  __m128i x = _mm_maskz_compress_epi16(__mmask8(1u<<i), vec.v);
  return (int16_t)_mm_cvtsi128_si32(x);
}
template <> inline int32_t extract_intrin<VecData<int32_t,4>>(VecData<int32_t,4> vec, Integer i) {
  __m128i x = _mm_maskz_compress_epi32(__mmask8(1u<<i), vec.v);
  return (int32_t)_mm_cvtsi128_si32(x);
}
//template <> inline int64_t extract_intrin<VecData<int64_t,2>>(VecData<int64_t,2> vec, Integer i) {}
template <> inline float extract_intrin<VecData<float,4>>(VecData<float,4> vec, Integer i) {
  __m128 x = _mm_maskz_compress_ps(__mmask8(1u<<i), vec.v);
  return _mm_cvtss_f32(x);
}
template <> inline double extract_intrin<VecData<double,2>>(VecData<double,2> vec, Integer i) {
  __m128d x = _mm_mask_unpackhi_pd(vec.v, __mmask8(i), vec.v, vec.v);
  return _mm_cvtsd_f64(x);
}
#endif
#if defined(__AVX512BW__) && defined(__AVX512VL__)
template <> inline void insert_intrin<VecData<int8_t,16>>(VecData<int8_t,16>& vec, Integer i, int8_t value) {
  vec.v = _mm_mask_set1_epi8(vec.v, __mmask16(1u<<i), value);
}
template <> inline void insert_intrin<VecData<int16_t,8>>(VecData<int16_t,8>& vec, Integer i, int16_t value) {
  vec.v = _mm_mask_set1_epi16(vec.v, __mmask8(1u<<i), value);
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__)
template <> inline void insert_intrin<VecData<int32_t,4>>(VecData<int32_t,4>& vec, Integer i, int32_t value) {
  vec.v = _mm_mask_set1_epi32(vec.v, __mmask8(1u<<i), value);
}
template <> inline void insert_intrin<VecData<int64_t,2>>(VecData<int64_t,2>& vec, Integer i, int64_t value) {
  vec.v = _mm_mask_set1_epi64(vec.v, __mmask8(1u<<i), value);
}
template <> inline void insert_intrin<VecData<float,4>>(VecData<float,4>& vec, Integer i, float value) {
  vec.v = _mm_mask_broadcastss_ps(vec.v, __mmask8(1u<<i), _mm_set_ss(value));
}
template <> inline void insert_intrin<VecData<double,2>>(VecData<double,2>& vec, Integer i, double value) {
  vec.v = _mm_mask_movedup_pd(vec.v, __mmask8(1u<<i), _mm_set_sd(value));
}
#endif
// Arithmetic operators
template <> inline VecData<int8_t,16> unary_minus_intrin<VecData<int8_t,16>>(const VecData<int8_t,16>& a) {
  return _mm_sub_epi8(_mm_setzero_si128(), a.v);
}
template <> inline VecData<int16_t,8> unary_minus_intrin<VecData<int16_t,8>>(const VecData<int16_t,8>& a) {
  return _mm_sub_epi16(_mm_setzero_si128(), a.v);
}
template <> inline VecData<int32_t,4> unary_minus_intrin<VecData<int32_t,4>>(const VecData<int32_t,4>& a) {
  return _mm_sub_epi32(_mm_setzero_si128(), a.v);
}
template <> inline VecData<int64_t,2> unary_minus_intrin<VecData<int64_t,2>>(const VecData<int64_t,2>& a) {
  return _mm_sub_epi64(_mm_setzero_si128(), a.v);
}
template <> inline VecData<float,4> unary_minus_intrin<VecData<float,4>>(const VecData<float,4>& a) {
  return _mm_xor_ps(a.v, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
}
template <> inline VecData<double,2> unary_minus_intrin<VecData<double,2>>(const VecData<double,2>& a) {
  return _mm_xor_pd(a.v, _mm_castsi128_pd(_mm_setr_epi32(0,0x80000000,0,0x80000000)));
}
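// The floating-point negations above flip only the IEEE-754 sign bit; unlike
// subtracting from zero this is exact for all inputs (0.0 - (+0.0) would give
// +0.0 rather than -0.0). Integer negation has no single-bit equivalent, so
// the integer versions subtract from zero.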
template <> inline VecData<int8_t,16> mul_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies
  __m128i aodd = _mm_srli_epi16(a.v,8);        // odd numbered elements of a
  __m128i bodd = _mm_srli_epi16(b.v,8);        // odd numbered elements of b
  __m128i muleven = _mm_mullo_epi16(a.v,b.v);  // product of even numbered elements
  __m128i mulodd = _mm_mullo_epi16(aodd,bodd); // product of odd numbered elements
  mulodd = _mm_slli_epi16(mulodd,8);           // put odd numbered elements back in place
#if defined(__AVX512VL__) && defined(__AVX512BW__)
  return _mm_mask_mov_epi8(mulodd, 0x5555, muleven);
#else
  __m128i mask = _mm_set1_epi32(0x00FF00FF);   // mask for even positions
  return selectb(mask,muleven,mulodd);         // interleave even and odd
#endif
}
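// Correctness note: only the low 8 bits of each 16-bit product survive (the
// 0x00FF mask for even lanes, the <<8 shift for odd lanes), and the low 8
// bits of a product do not depend on signedness, so this yields the correctly
// wrapped int8 result.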
template <> inline VecData<int16_t,8> mul_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_mullo_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,4> mul_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a.v, b.v);
#else
  __m128i a13 = _mm_shuffle_epi32(a.v, 0xF5);         // (-,a3,-,a1)
  __m128i b13 = _mm_shuffle_epi32(b.v, 0xF5);         // (-,b3,-,b1)
  __m128i prod02 = _mm_mul_epu32(a.v, b.v);           // (-,a2*b2,-,a0*b0)
  __m128i prod13 = _mm_mul_epu32(a13, b13);           // (-,a3*b3,-,a1*b1)
  __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0)
  __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2)
  return _mm_unpacklo_epi64(prod01,prod23);           // (ab3,ab2,ab1,ab0)
#endif
}
template <> inline VecData<int64_t,2> mul_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
  return _mm_mullo_epi64(a.v, b.v);
#elif defined(__SSE4_1__)
  // Split into 32-bit multiplies
  __m128i bswap = _mm_shuffle_epi32(b.v,0xB1);       // b0H,b0L,b1H,b1L (swap H<->L)
  __m128i prodlh = _mm_mullo_epi32(a.v,bswap);       // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products
  __m128i zero = _mm_setzero_si128();                // 0
  __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero);     // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
  __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
  __m128i prodll = _mm_mul_epu32(a.v,b.v);           // a0Lb0L,a1Lb1L, 64 bit unsigned products
  __m128i prod = _mm_add_epi64(prodll,prodlh3);      // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
  return prod;
#else // SSE2
  using VData = VecData<int64_t,2>;
  union U {
    VData v;
    typename VData::ScalarType x[VData::Size];
  };
  U a_ = {a};
  U b_ = {b};
  for (Integer i = 0; i < VData::Size; i++) a_.x[i] *= b_.x[i];
  return a_.v;
#endif
}
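// Derivation for the SSE4.1 path above: writing a = aH*2^32 + aL and
// b = bH*2^32 + bL, the low 64 bits of the product are
//   a*b mod 2^64 = aL*bL + ((aL*bH + aH*bL) mod 2^32)*2^32;
// the aH*bH term vanishes modulo 2^64. _mm_mul_epu32 supplies the full-width
// aL*bL, and the mullo/hadd/shuffle steps assemble the shifted cross terms.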
template <> inline VecData<float,4> mul_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_mul_ps(a.v, b.v);
}
template <> inline VecData<double,2> mul_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_mul_pd(a.v, b.v);
}
template <> inline VecData<float,4> div_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_div_ps(a.v, b.v);
}
template <> inline VecData<double,2> div_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_div_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> add_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_add_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,8> add_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_add_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,4> add_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_add_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,2> add_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_add_epi64(a.v, b.v);
}
template <> inline VecData<float,4> add_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_add_ps(a.v, b.v);
}
template <> inline VecData<double,2> add_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_add_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> sub_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_sub_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,8> sub_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_sub_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,4> sub_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_sub_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,2> sub_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_sub_epi64(a.v, b.v);
}
template <> inline VecData<float,4> sub_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_sub_ps(a.v, b.v);
}
template <> inline VecData<double,2> sub_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_sub_pd(a.v, b.v);
}
//template <> inline VecData<int8_t,16> fma_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b, const VecData<int8_t,16>& c) {}
//template <> inline VecData<int16_t,8> fma_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b, const VecData<int16_t,8>& c) {}
//template <> inline VecData<int32_t,4> fma_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b, const VecData<int32_t,4>& c) {}
//template <> inline VecData<int64_t,2> fma_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b, const VecData<int64_t,2>& c) {}
template <> inline VecData<float,4> fma_intrin(const VecData<float,4>& a, const VecData<float,4>& b, const VecData<float,4>& c) {
#ifdef __FMA__
  return _mm_fmadd_ps(a.v, b.v, c.v);
#elif defined(__FMA4__)
  return _mm_macc_ps(a.v, b.v, c.v);
#else
  return add_intrin(mul_intrin(a,b), c);
#endif
}
template <> inline VecData<double,2> fma_intrin(const VecData<double,2>& a, const VecData<double,2>& b, const VecData<double,2>& c) {
#ifdef __FMA__
  return _mm_fmadd_pd(a.v, b.v, c.v);
#elif defined(__FMA4__)
  return _mm_macc_pd(a.v, b.v, c.v);
#else
  return add_intrin(mul_intrin(a,b), c);
#endif
}
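// Note: the __FMA__/__FMA4__ paths compute a*b+c with a single rounding; the
// fallback add_intrin(mul_intrin(a,b), c) rounds twice (after the multiply
// and after the add), so the two paths can differ in the last ulp.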
// Bitwise operators
template <> inline VecData<int8_t,16> not_intrin<VecData<int8_t,16>>(const VecData<int8_t,16>& a) {
  return _mm_xor_si128(a.v, _mm_set1_epi32(-1));
}
template <> inline VecData<int16_t,8> not_intrin<VecData<int16_t,8>>(const VecData<int16_t,8>& a) {
  return _mm_xor_si128(a.v, _mm_set1_epi32(-1));
}
template <> inline VecData<int32_t,4> not_intrin<VecData<int32_t,4>>(const VecData<int32_t,4>& a) {
  return _mm_xor_si128(a.v, _mm_set1_epi32(-1));
}
template <> inline VecData<int64_t,2> not_intrin<VecData<int64_t,2>>(const VecData<int64_t,2>& a) {
  return _mm_xor_si128(a.v, _mm_set1_epi32(-1));
}
template <> inline VecData<float,4> not_intrin<VecData<float,4>>(const VecData<float,4>& a) {
  return _mm_xor_ps(a.v, _mm_castsi128_ps(_mm_set1_epi32(-1)));
}
template <> inline VecData<double,2> not_intrin<VecData<double,2>>(const VecData<double,2>& a) {
  return _mm_xor_pd(a.v, _mm_castsi128_pd(_mm_set1_epi32(-1)));
}
template <> inline VecData<int8_t,16> and_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_and_si128(a.v, b.v);
}
template <> inline VecData<int16_t,8> and_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_and_si128(a.v, b.v);
}
template <> inline VecData<int32_t,4> and_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_and_si128(a.v, b.v);
}
template <> inline VecData<int64_t,2> and_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_and_si128(a.v, b.v);
}
template <> inline VecData<float,4> and_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_and_ps(a.v, b.v);
}
template <> inline VecData<double,2> and_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_and_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> xor_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_xor_si128(a.v, b.v);
}
template <> inline VecData<int16_t,8> xor_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_xor_si128(a.v, b.v);
}
template <> inline VecData<int32_t,4> xor_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_xor_si128(a.v, b.v);
}
template <> inline VecData<int64_t,2> xor_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_xor_si128(a.v, b.v);
}
template <> inline VecData<float,4> xor_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_xor_ps(a.v, b.v);
}
template <> inline VecData<double,2> xor_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_xor_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> or_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_or_si128(a.v, b.v);
}
template <> inline VecData<int16_t,8> or_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_or_si128(a.v, b.v);
}
template <> inline VecData<int32_t,4> or_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_or_si128(a.v, b.v);
}
template <> inline VecData<int64_t,2> or_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_or_si128(a.v, b.v);
}
template <> inline VecData<float,4> or_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_or_ps(a.v, b.v);
}
template <> inline VecData<double,2> or_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_or_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> andnot_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_andnot_si128(b.v, a.v);
}
template <> inline VecData<int16_t,8> andnot_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_andnot_si128(b.v, a.v);
}
template <> inline VecData<int32_t,4> andnot_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_andnot_si128(b.v, a.v);
}
template <> inline VecData<int64_t,2> andnot_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_andnot_si128(b.v, a.v);
}
template <> inline VecData<float,4> andnot_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_andnot_ps(b.v, a.v);
}
template <> inline VecData<double,2> andnot_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_andnot_pd(b.v, a.v);
}
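// Argument-order note: andnot_intrin(a,b) computes a & ~b, while the
// underlying _mm_andnot_* intrinsics compute ~x & y, so the operands are
// passed to the intrinsics in swapped order above.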
// Bitshift
//template <> inline VecData<int8_t,16> bitshiftleft_intrin<VecData<int8_t,16>>(const VecData<int8_t,16>& a, const Integer& rhs) { }
template <> inline VecData<int16_t,8> bitshiftleft_intrin<VecData<int16_t,8>>(const VecData<int16_t,8>& a, const Integer& rhs) { return _mm_slli_epi16(a.v, rhs); }
template <> inline VecData<int32_t,4> bitshiftleft_intrin<VecData<int32_t,4>>(const VecData<int32_t,4>& a, const Integer& rhs) { return _mm_slli_epi32(a.v, rhs); }
template <> inline VecData<int64_t,2> bitshiftleft_intrin<VecData<int64_t,2>>(const VecData<int64_t,2>& a, const Integer& rhs) { return _mm_slli_epi64(a.v, rhs); }
template <> inline VecData<float ,4> bitshiftleft_intrin<VecData<float ,4>>(const VecData<float ,4>& a, const Integer& rhs) { return _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(a.v), rhs)); }
template <> inline VecData<double ,2> bitshiftleft_intrin<VecData<double ,2>>(const VecData<double ,2>& a, const Integer& rhs) { return _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(a.v), rhs)); }
//template <> inline VecData<int8_t,16> bitshiftright_intrin<VecData<int8_t,16>>(const VecData<int8_t,16>& a, const Integer& rhs) { }
template <> inline VecData<int16_t,8> bitshiftright_intrin<VecData<int16_t,8>>(const VecData<int16_t,8>& a, const Integer& rhs) { return _mm_srli_epi16(a.v, rhs); }
template <> inline VecData<int32_t,4> bitshiftright_intrin<VecData<int32_t,4>>(const VecData<int32_t,4>& a, const Integer& rhs) { return _mm_srli_epi32(a.v, rhs); }
template <> inline VecData<int64_t,2> bitshiftright_intrin<VecData<int64_t,2>>(const VecData<int64_t,2>& a, const Integer& rhs) { return _mm_srli_epi64(a.v, rhs); }
template <> inline VecData<float ,4> bitshiftright_intrin<VecData<float ,4>>(const VecData<float ,4>& a, const Integer& rhs) { return _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(a.v), rhs)); }
template <> inline VecData<double ,2> bitshiftright_intrin<VecData<double ,2>>(const VecData<double ,2>& a, const Integer& rhs) { return _mm_castsi128_pd(_mm_srli_epi64(_mm_castpd_si128(a.v), rhs)); }
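// These shifts are logical (zero-filling). The float/double overloads shift
// the raw IEEE-754 bit patterns through an integer cast, which is intended
// for bit manipulation (e.g. extracting exponent fields), not arithmetic.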
// Other functions
template <> inline VecData<int8_t,16> max_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_max_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,8> max_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_max_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,4> max_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_max_epi32(a.v, b.v);
}
#if defined(__AVX512F__) && defined(__AVX512VL__)
template <> inline VecData<int64_t,2> max_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_max_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,4> max_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_max_ps(a.v, b.v);
}
template <> inline VecData<double,2> max_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_max_pd(a.v, b.v);
}
template <> inline VecData<int8_t,16> min_intrin(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) {
  return _mm_min_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,8> min_intrin(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) {
  return _mm_min_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,4> min_intrin(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) {
  return _mm_min_epi32(a.v, b.v);
}
#if defined(__AVX512F__) && defined(__AVX512VL__)
template <> inline VecData<int64_t,2> min_intrin(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) {
  return _mm_min_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,4> min_intrin(const VecData<float,4>& a, const VecData<float,4>& b) {
  return _mm_min_ps(a.v, b.v);
}
template <> inline VecData<double,2> min_intrin(const VecData<double,2>& a, const VecData<double,2>& b) {
  return _mm_min_pd(a.v, b.v);
}
// Conversion operators
template <> inline VecData<float ,4> convert_int2real_intrin<VecData<float ,4>,VecData<int32_t,4>>(const VecData<int32_t,4>& x) {
  return _mm_cvtepi32_ps(x.v);
}
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
template <> inline VecData<double,2> convert_int2real_intrin<VecData<double,2>,VecData<int64_t,2>>(const VecData<int64_t,2>& x) {
  return _mm_cvtepi64_pd(x.v);
}
#endif
template <> inline VecData<int32_t,4> round_real2int_intrin<VecData<int32_t,4>,VecData<float ,4>>(const VecData<float ,4>& x) {
  return _mm_cvtps_epi32(x.v);
}
template <> inline VecData<int64_t,2> round_real2int_intrin<VecData<int64_t,2>,VecData<double,2>>(const VecData<double,2>& x) {
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
  return _mm_cvtpd_epi64(x.v);
#else
  return _mm_cvtepi32_epi64(_mm_cvtpd_epi32(x.v));
#endif
}
template <> inline VecData<float ,4> round_real2real_intrin<VecData<float ,4>>(const VecData<float ,4>& x) { return _mm_round_ps(x.v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); }
template <> inline VecData<double,2> round_real2real_intrin<VecData<double,2>>(const VecData<double,2>& x) { return _mm_round_pd(x.v, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); }
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
// Comparison operators
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::lt>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return Mask<VecData<int8_t,16>>(_mm_cmplt_epi8(a.v,b.v)); }
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::le>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::gt>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return Mask<VecData<int8_t,16>>(_mm_cmpgt_epi8(a.v,b.v)); }
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::ge>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::eq>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return Mask<VecData<int8_t,16>>(_mm_cmpeq_epi8(a.v,b.v)); }
template <> inline Mask<VecData<int8_t,16>> comp_intrin<ComparisonType::ne>(const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::lt>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return Mask<VecData<int16_t,8>>(_mm_cmplt_epi16(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::le>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::gt>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return Mask<VecData<int16_t,8>>(_mm_cmpgt_epi16(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::ge>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::eq>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return Mask<VecData<int16_t,8>>(_mm_cmpeq_epi16(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,8>> comp_intrin<ComparisonType::ne>(const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::lt>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return Mask<VecData<int32_t,4>>(_mm_cmplt_epi32(a.v,b.v)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::le>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::gt>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return Mask<VecData<int32_t,4>>(_mm_cmpgt_epi32(a.v,b.v)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::ge>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::eq>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return Mask<VecData<int32_t,4>>(_mm_cmpeq_epi32(a.v,b.v)); }
template <> inline Mask<VecData<int32_t,4>> comp_intrin<ComparisonType::ne>(const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::lt>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return Mask<VecData<int64_t,2>>(_mm_cmpgt_epi64(b.v,a.v)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::le>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::gt>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return Mask<VecData<int64_t,2>>(_mm_cmpgt_epi64(a.v,b.v)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::ge>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::eq>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return Mask<VecData<int64_t,2>>(_mm_cmpeq_epi64(a.v,b.v)); }
template <> inline Mask<VecData<int64_t,2>> comp_intrin<ComparisonType::ne>(const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::lt>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmplt_ps(a.v,b.v)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::le>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmple_ps(a.v,b.v)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::gt>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmpgt_ps(a.v,b.v)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::ge>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmpge_ps(a.v,b.v)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::eq>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmpeq_ps(a.v,b.v)); }
template <> inline Mask<VecData<float,4>> comp_intrin<ComparisonType::ne>(const VecData<float,4>& a, const VecData<float,4>& b) { return Mask<VecData<float,4>>(_mm_cmpneq_ps(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::lt>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmplt_pd(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::le>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmple_pd(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::gt>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmpgt_pd(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::ge>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmpge_pd(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::eq>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmpeq_pd(a.v,b.v)); }
template <> inline Mask<VecData<double,2>> comp_intrin<ComparisonType::ne>(const VecData<double,2>& a, const VecData<double,2>& b) { return Mask<VecData<double,2>>(_mm_cmpneq_pd(a.v,b.v)); }
template <> inline VecData<int8_t,16> select_intrin(const Mask<VecData<int8_t,16>>& s, const VecData<int8_t,16>& a, const VecData<int8_t,16>& b) { return _mm_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int16_t,8> select_intrin(const Mask<VecData<int16_t,8>>& s, const VecData<int16_t,8>& a, const VecData<int16_t,8>& b) { return _mm_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int32_t,4> select_intrin(const Mask<VecData<int32_t,4>>& s, const VecData<int32_t,4>& a, const VecData<int32_t,4>& b) { return _mm_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int64_t,2> select_intrin(const Mask<VecData<int64_t,2>>& s, const VecData<int64_t,2>& a, const VecData<int64_t,2>& b) { return _mm_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<float ,4> select_intrin(const Mask<VecData<float ,4>>& s, const VecData<float ,4>& a, const VecData<float ,4>& b) { return _mm_blendv_ps(b.v, a.v, s.v); }
template <> inline VecData<double ,2> select_intrin(const Mask<VecData<double ,2>>& s, const VecData<double ,2>& a, const VecData<double ,2>& b) { return _mm_blendv_pd(b.v, a.v, s.v); }
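// Typical use (an illustrative sketch, not part of the library): branch-free
// absolute value of packed floats by combining a comparison mask with select.
//   VecData<float,4> zero = zero_intrin<VecData<float,4>>();
//   Mask<VecData<float,4>> m = comp_intrin<ComparisonType::lt>(x, zero); // lanes where x < 0
//   VecData<float,4> ax = select_intrin(m, unary_minus_intrin(x), x);    // -x where x < 0, else x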
// Special functions
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<float,4>> {
  static inline VecData<float,4> eval(const VecData<float,4>& a) {
#if defined(__AVX512F__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,4>>::eval(_mm_maskz_rsqrt14_ps(~__mmask8(0), a.v), a.v);
#else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,4>>::eval(_mm_rsqrt_ps(a.v), a.v);
#endif
  }
  static inline VecData<float,4> eval(const VecData<float,4>& a, const Mask<VecData<float,4>>& m) {
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,4>>::eval(_mm_maskz_rsqrt14_ps(_mm_movepi32_mask(_mm_castps_si128(m.v)), a.v), a.v);
#else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,4>>::eval(and_intrin(VecData<float,4>(_mm_rsqrt_ps(a.v)), convert_mask2vec_intrin(m)), a.v);
#endif
  }
};
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<double,2>> {
  static inline VecData<double,2> eval(const VecData<double,2>& a) {
#if defined(__AVX512F__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,2>>::eval(_mm_maskz_rsqrt14_pd(~__mmask8(0), a.v), a.v);
#else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,2>>::eval(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(a.v))), a.v);
#endif
  }
  static inline VecData<double,2> eval(const VecData<double,2>& a, const Mask<VecData<double,2>>& m) {
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,2>>::eval(_mm_maskz_rsqrt14_pd(_mm_movepi64_mask(_mm_castpd_si128(m.v)), a.v), a.v);
#else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,2>>::eval(and_intrin(VecData<double,2>(_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(a.v)))), convert_mask2vec_intrin(m)), a.v);
#endif
  }
};
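// Iteration-count rationale: _mm_rsqrt_ps guarantees roughly 11.4 correct
// bits (about 3.44 decimal digits) and the AVX-512 rsqrt14 forms give ~14
// bits (about 4.21 digits); each Newton iteration roughly doubles the number
// of correct digits, so mylog2(digits/base_digits) iterations reach the
// requested precision. Those per-digit constants are where 3.4362686889 and
// 4.2144199393 come from.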
#ifdef SCTL_HAVE_SVML
template <> inline void sincos_intrin<VecData<float ,4>>(VecData<float ,4>& sinx, VecData<float ,4>& cosx, const VecData<float ,4>& x) { sinx = _mm_sincos_ps(&cosx.v, x.v); }
template <> inline void sincos_intrin<VecData<double,2>>(VecData<double,2>& sinx, VecData<double,2>& cosx, const VecData<double,2>& x) { sinx = _mm_sincos_pd(&cosx.v, x.v); }
template <> inline VecData<float ,4> exp_intrin<VecData<float ,4>>(const VecData<float ,4>& x) { return _mm_exp_ps(x.v); }
template <> inline VecData<double,2> exp_intrin<VecData<double,2>>(const VecData<double,2>& x) { return _mm_exp_pd(x.v); }
#else
template <> inline void sincos_intrin<VecData<float ,4>>(VecData<float ,4>& sinx, VecData<float ,4>& cosx, const VecData<float ,4>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<float>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline void sincos_intrin<VecData<double,2>>(VecData<double,2>& sinx, VecData<double,2>& cosx, const VecData<double,2>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<double>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline VecData<float ,4> exp_intrin<VecData<float ,4>>(const VecData<float ,4>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<float>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
template <> inline VecData<double,2> exp_intrin<VecData<double,2>>(const VecData<double,2>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<double>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
#endif
#endif
}
namespace SCTL_NAMESPACE { // AVX
#ifdef __AVX__
template <> struct alignas(sizeof(int8_t) * 32) VecData<int8_t,32> {
  using ScalarType = int8_t;
  static constexpr Integer Size = 32;
  VecData() = default;
  inline VecData(__m256i v_) : v(v_) {}
  __m256i v;
};
template <> struct alignas(sizeof(int16_t) * 16) VecData<int16_t,16> {
  using ScalarType = int16_t;
  static constexpr Integer Size = 16;
  VecData() = default;
  inline VecData(__m256i v_) : v(v_) {}
  __m256i v;
};
template <> struct alignas(sizeof(int32_t) * 8) VecData<int32_t,8> {
  using ScalarType = int32_t;
  static constexpr Integer Size = 8;
  VecData() = default;
  inline VecData(__m256i v_) : v(v_) {}
  __m256i v;
};
template <> struct alignas(sizeof(int64_t) * 4) VecData<int64_t,4> {
  using ScalarType = int64_t;
  static constexpr Integer Size = 4;
  VecData() = default;
  inline VecData(__m256i v_) : v(v_) {}
  __m256i v;
};
template <> struct alignas(sizeof(float) * 8) VecData<float,8> {
  using ScalarType = float;
  static constexpr Integer Size = 8;
  VecData() = default;
  inline VecData(__m256 v_) : v(v_) {}
  __m256 v;
};
template <> struct alignas(sizeof(double) * 4) VecData<double,4> {
  using ScalarType = double;
  static constexpr Integer Size = 4;
  VecData() = default;
  inline VecData(__m256d v_) : v(v_) {}
  __m256d v;
};
// Select between two sources, byte by byte. Used in various functions and operators
// Corresponds to this pseudocode:
// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i];
// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed.
// Only bit 7 in each byte of s is checked.
#if defined(__AVX2__)
static inline __m256i selectb (__m256i const & s, __m256i const & a, __m256i const & b) {
  return _mm256_blendv_epi8(b, a, s);
  //union U {
  //  __m256i v;
  //  int8_t x[32];
  //};
  //U s_ = {s};
  //U a_ = {a};
  //U b_ = {b};
  //for (Integer i = 0; i < 32; i++) {
  //  a_.x[i] = (s_.x[i] ? a_.x[i] : b_.x[i]);
  //}
  //return a_.v;
}
#endif
template <> inline VecData<int8_t,32> zero_intrin<VecData<int8_t,32>>() {
  return _mm256_setzero_si256();
}
template <> inline VecData<int16_t,16> zero_intrin<VecData<int16_t,16>>() {
  return _mm256_setzero_si256();
}
template <> inline VecData<int32_t,8> zero_intrin<VecData<int32_t,8>>() {
  return _mm256_setzero_si256();
}
template <> inline VecData<int64_t,4> zero_intrin<VecData<int64_t,4>>() {
  return _mm256_setzero_si256();
}
template <> inline VecData<float,8> zero_intrin<VecData<float,8>>() {
  return _mm256_setzero_ps();
}
template <> inline VecData<double,4> zero_intrin<VecData<double,4>>() {
  return _mm256_setzero_pd();
}
template <> inline VecData<int8_t,32> set1_intrin<VecData<int8_t,32>>(int8_t a) {
  return _mm256_set1_epi8(a);
}
template <> inline VecData<int16_t,16> set1_intrin<VecData<int16_t,16>>(int16_t a) {
  return _mm256_set1_epi16(a);
}
template <> inline VecData<int32_t,8> set1_intrin<VecData<int32_t,8>>(int32_t a) {
  return _mm256_set1_epi32(a);
}
template <> inline VecData<int64_t,4> set1_intrin<VecData<int64_t,4>>(int64_t a) {
  return _mm256_set1_epi64x(a);
}
template <> inline VecData<float,8> set1_intrin<VecData<float,8>>(float a) {
  return _mm256_set1_ps(a);
}
template <> inline VecData<double,4> set1_intrin<VecData<double,4>>(double a) {
  return _mm256_set1_pd(a);
}
template <> inline VecData<int8_t,32> set_intrin<VecData<int8_t,32>,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t>(int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31, int8_t v32) {
  return _mm256_set_epi8(v32,v31,v30,v29,v28,v27,v26,v25,v24,v23,v22,v21,v20,v19,v18,v17,v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int16_t,16> set_intrin<VecData<int16_t,16>,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t>(int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7, int16_t v8, int16_t v9, int16_t v10, int16_t v11, int16_t v12, int16_t v13, int16_t v14, int16_t v15, int16_t v16) {
  return _mm256_set_epi16(v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int32_t,8> set_intrin<VecData<int32_t,8>,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t>(int32_t v1, int32_t v2, int32_t v3, int32_t v4, int32_t v5, int32_t v6, int32_t v7, int32_t v8) {
  return _mm256_set_epi32(v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int64_t,4> set_intrin<VecData<int64_t,4>,int64_t,int64_t,int64_t,int64_t>(int64_t v1, int64_t v2, int64_t v3, int64_t v4) {
  return _mm256_set_epi64x(v4,v3,v2,v1);
}
template <> inline VecData<float,8> set_intrin<VecData<float,8>,float,float,float,float,float,float,float,float>(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) {
  return _mm256_set_ps(v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<double,4> set_intrin<VecData<double,4>,double,double,double,double>(double v1, double v2, double v3, double v4) {
  return _mm256_set_pd(v4,v3,v2,v1);
}
template <> inline VecData<int8_t,32> load1_intrin<VecData<int8_t,32>>(int8_t const* p) {
  return _mm256_set1_epi8(p[0]);
}
template <> inline VecData<int16_t,16> load1_intrin<VecData<int16_t,16>>(int16_t const* p) {
  return _mm256_set1_epi16(p[0]);
}
template <> inline VecData<int32_t,8> load1_intrin<VecData<int32_t,8>>(int32_t const* p) {
  return _mm256_set1_epi32(p[0]);
}
template <> inline VecData<int64_t,4> load1_intrin<VecData<int64_t,4>>(int64_t const* p) {
  return _mm256_set1_epi64x(p[0]);
}
template <> inline VecData<float,8> load1_intrin<VecData<float,8>>(float const* p) {
  return _mm256_broadcast_ss(p);
}
template <> inline VecData<double,4> load1_intrin<VecData<double,4>>(double const* p) {
  return _mm256_broadcast_sd(p);
}
template <> inline VecData<int8_t,32> loadu_intrin<VecData<int8_t,32>>(int8_t const* p) {
  return _mm256_loadu_si256((__m256i const*)p);
}
template <> inline VecData<int16_t,16> loadu_intrin<VecData<int16_t,16>>(int16_t const* p) {
  return _mm256_loadu_si256((__m256i const*)p);
}
template <> inline VecData<int32_t,8> loadu_intrin<VecData<int32_t,8>>(int32_t const* p) {
  return _mm256_loadu_si256((__m256i const*)p);
}
template <> inline VecData<int64_t,4> loadu_intrin<VecData<int64_t,4>>(int64_t const* p) {
  return _mm256_loadu_si256((__m256i const*)p);
}
template <> inline VecData<float,8> loadu_intrin<VecData<float,8>>(float const* p) {
  return _mm256_loadu_ps(p);
}
template <> inline VecData<double,4> loadu_intrin<VecData<double,4>>(double const* p) {
  return _mm256_loadu_pd(p);
}
template <> inline VecData<int8_t,32> load_intrin<VecData<int8_t,32>>(int8_t const* p) {
  return _mm256_load_si256((__m256i const*)p);
}
template <> inline VecData<int16_t,16> load_intrin<VecData<int16_t,16>>(int16_t const* p) {
  return _mm256_load_si256((__m256i const*)p);
}
template <> inline VecData<int32_t,8> load_intrin<VecData<int32_t,8>>(int32_t const* p) {
  return _mm256_load_si256((__m256i const*)p);
}
template <> inline VecData<int64_t,4> load_intrin<VecData<int64_t,4>>(int64_t const* p) {
  return _mm256_load_si256((__m256i const*)p);
}
template <> inline VecData<float,8> load_intrin<VecData<float,8>>(float const* p) {
  return _mm256_load_ps(p);
}
template <> inline VecData<double,4> load_intrin<VecData<double,4>>(double const* p) {
  return _mm256_load_pd(p);
}
template <> inline void storeu_intrin<VecData<int8_t,32>>(int8_t* p, VecData<int8_t,32> vec) {
  _mm256_storeu_si256((__m256i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int16_t,16>>(int16_t* p, VecData<int16_t,16> vec) {
  _mm256_storeu_si256((__m256i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int32_t,8>>(int32_t* p, VecData<int32_t,8> vec) {
  _mm256_storeu_si256((__m256i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int64_t,4>>(int64_t* p, VecData<int64_t,4> vec) {
  _mm256_storeu_si256((__m256i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<float,8>>(float* p, VecData<float,8> vec) {
  _mm256_storeu_ps(p, vec.v);
}
template <> inline void storeu_intrin<VecData<double,4>>(double* p, VecData<double,4> vec) {
  _mm256_storeu_pd(p, vec.v);
}
template <> inline void store_intrin<VecData<int8_t,32>>(int8_t* p, VecData<int8_t,32> vec) {
  _mm256_store_si256((__m256i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int16_t,16>>(int16_t* p, VecData<int16_t,16> vec) {
  _mm256_store_si256((__m256i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int32_t,8>>(int32_t* p, VecData<int32_t,8> vec) {
  _mm256_store_si256((__m256i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int64_t,4>>(int64_t* p, VecData<int64_t,4> vec) {
  _mm256_store_si256((__m256i*)p, vec.v);
}
template <> inline void store_intrin<VecData<float,8>>(float* p, VecData<float,8> vec) {
  _mm256_store_ps(p, vec.v);
}
template <> inline void store_intrin<VecData<double,4>>(double* p, VecData<double,4> vec) {
  _mm256_store_pd(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int8_t,32>>(int8_t* p, VecData<int8_t,32> vec) {
  _mm256_stream_si256((__m256i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int16_t,16>>(int16_t* p, VecData<int16_t,16> vec) {
  _mm256_stream_si256((__m256i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int32_t,8>>(int32_t* p, VecData<int32_t,8> vec) {
  _mm256_stream_si256((__m256i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int64_t,4>>(int64_t* p, VecData<int64_t,4> vec) {
  _mm256_stream_si256((__m256i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<float,8>>(float* p, VecData<float,8> vec) {
  _mm256_stream_ps(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<double,4>>(double* p, VecData<double,4> vec) {
  _mm256_stream_pd(p, vec.v);
}
//template <> inline int8_t extract_intrin<VecData<int8_t,32>>(VecData<int8_t,32> vec, Integer i) {}
//template <> inline int16_t extract_intrin<VecData<int16_t,16>>(VecData<int16_t,16> vec, Integer i) {}
//template <> inline int32_t extract_intrin<VecData<int32_t,8>>(VecData<int32_t,8> vec, Integer i) {}
//template <> inline int64_t extract_intrin<VecData<int64_t,4>>(VecData<int64_t,4> vec, Integer i) {}
//template <> inline float extract_intrin<VecData<float,8>>(VecData<float,8> vec, Integer i) {}
//template <> inline double extract_intrin<VecData<double,4>>(VecData<double,4> vec, Integer i) {}
//template <> inline void insert_intrin<VecData<int8_t,32>>(VecData<int8_t,32>& vec, Integer i, int8_t value) {}
//template <> inline void insert_intrin<VecData<int16_t,16>>(VecData<int16_t,16>& vec, Integer i, int16_t value) {}
//template <> inline void insert_intrin<VecData<int32_t,8>>(VecData<int32_t,8>& vec, Integer i, int32_t value) {}
//template <> inline void insert_intrin<VecData<int64_t,4>>(VecData<int64_t,4>& vec, Integer i, int64_t value) {}
//template <> inline void insert_intrin<VecData<float,8>>(VecData<float,8>& vec, Integer i, float value) {}
//template <> inline void insert_intrin<VecData<double,4>>(VecData<double,4>& vec, Integer i, double value) {}
// Arithmetic operators
#ifdef __AVX2__
template <> inline VecData<int8_t,32> unary_minus_intrin<VecData<int8_t,32>>(const VecData<int8_t,32>& a) {
  return _mm256_sub_epi8(_mm256_setzero_si256(), a.v);
}
template <> inline VecData<int16_t,16> unary_minus_intrin<VecData<int16_t,16>>(const VecData<int16_t,16>& a) {
  return _mm256_sub_epi16(_mm256_setzero_si256(), a.v);
}
template <> inline VecData<int32_t,8> unary_minus_intrin<VecData<int32_t,8>>(const VecData<int32_t,8>& a) {
  return _mm256_sub_epi32(_mm256_setzero_si256(), a.v);
}
template <> inline VecData<int64_t,4> unary_minus_intrin<VecData<int64_t,4>>(const VecData<int64_t,4>& a) {
  return _mm256_sub_epi64(_mm256_setzero_si256(), a.v);
}
#endif
template <> inline VecData<float,8> unary_minus_intrin<VecData<float,8>>(const VecData<float,8>& a) {
  return _mm256_xor_ps(a.v, _mm256_set1_ps(-0.0f));
}
template <> inline VecData<double,4> unary_minus_intrin<VecData<double,4>>(const VecData<double,4>& a) {
  return _mm256_xor_pd(a.v, _mm256_set1_pd(-0.0)); // flip the sign bit of each lane
}
  1662. #ifdef __AVX2__
  1663. template <> inline VecData<int8_t,32> mul_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  1664. // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies
  1665. __m256i aodd = _mm256_srli_epi16(a.v,8); // odd numbered elements of a
  1666. __m256i bodd = _mm256_srli_epi16(b.v,8); // odd numbered elements of b
  1667. __m256i muleven = _mm256_mullo_epi16(a.v,b.v); // product of even numbered elements
  1668. __m256i mulodd = _mm256_mullo_epi16(aodd,bodd); // product of odd numbered elements
  1669. mulodd = _mm256_slli_epi16(mulodd,8); // put odd numbered elements back in place
  1670. #if defined(__AVX512VL__) && defined(__AVX512BW__)
  1671. return _mm256_mask_mov_epi8(mulodd, 0x55555555, muleven);
  1672. #else
  1673. __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for even positions
  1674. __m256i product = selectb(mask,muleven,mulodd); // interleave even and odd
  1675. return product;
  1676. #endif
  1677. }
template <> inline VecData<int16_t,16> mul_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_mullo_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,8> mul_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_mullo_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,4> mul_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  #if defined(__AVX512DQ__) && defined(__AVX512VL__)
  return _mm256_mullo_epi64(a.v, b.v);
  #else
  // Split into 32-bit multiplies
  __m256i bswap = _mm256_shuffle_epi32(b.v,0xB1);       // swap H<->L
  __m256i prodlh = _mm256_mullo_epi32(a.v,bswap);       // 32 bit L*H products
  __m256i zero = _mm256_setzero_si256();                // 0
  __m256i prodlh2 = _mm256_hadd_epi32(prodlh,zero);     // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0
  __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L
  __m256i prodll = _mm256_mul_epu32(a.v,b.v);           // a0Lb0L,a1Lb1L, 64 bit unsigned products
  __m256i prod = _mm256_add_epi64(prodll,prodlh3);      // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32
  return prod;
  #endif
}
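// The fallback above is the standard 32-bit decomposition: writing
// a = aH*2^32 + aL and b = bH*2^32 + bL, modulo 2^64
//   a*b = aL*bL + ((aL*bH + aH*bL) << 32),
// so one 32x32->64 unsigned multiply (_mm256_mul_epu32) plus the low halves
// of the two cross products suffice; the aH*bH term is shifted out entirely.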
#endif
template <> inline VecData<float,8> mul_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_mul_ps(a.v, b.v);
}
template <> inline VecData<double,4> mul_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_mul_pd(a.v, b.v);
}
template <> inline VecData<float,8> div_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_div_ps(a.v, b.v);
}
template <> inline VecData<double,4> div_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_div_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> add_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_add_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,16> add_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_add_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,8> add_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_add_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,4> add_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_add_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,8> add_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_add_ps(a.v, b.v);
}
template <> inline VecData<double,4> add_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_add_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> sub_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_sub_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,16> sub_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_sub_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,8> sub_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_sub_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,4> sub_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_sub_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,8> sub_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_sub_ps(a.v, b.v);
}
template <> inline VecData<double,4> sub_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_sub_pd(a.v, b.v);
}
//template <> inline VecData<int8_t,32> fma_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b, const VecData<int8_t,32>& c) {}
//template <> inline VecData<int16_t,16> fma_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b, const VecData<int16_t,16>& c) {}
//template <> inline VecData<int32_t,8> fma_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b, const VecData<int32_t,8>& c) {}
//template <> inline VecData<int64_t,4> fma_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b, const VecData<int64_t,4>& c) {}
template <> inline VecData<float,8> fma_intrin(const VecData<float,8>& a, const VecData<float,8>& b, const VecData<float,8>& c) {
  #ifdef __FMA__
  return _mm256_fmadd_ps(a.v, b.v, c.v);
  #elif defined(__FMA4__)
  return _mm256_macc_ps(a.v, b.v, c.v);
  #else
  return add_intrin(mul_intrin(a,b), c);
  #endif
}
template <> inline VecData<double,4> fma_intrin(const VecData<double,4>& a, const VecData<double,4>& b, const VecData<double,4>& c) {
  #ifdef __FMA__
  return _mm256_fmadd_pd(a.v, b.v, c.v);
  #elif defined(__FMA4__)
  return _mm256_macc_pd(a.v, b.v, c.v);
  #else
  return add_intrin(mul_intrin(a,b), c);
  #endif
}
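// Note: the __FMA__/__FMA4__ paths compute a*b+c with a single rounding; the
// generic fallback rounds after the multiply and again after the add, so the
// two paths can differ in the last bit of the result.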
// Bitwise operators
template <> inline VecData<int8_t,32> not_intrin<VecData<int8_t,32>>(const VecData<int8_t,32>& a) {
  #ifdef __AVX2__
  return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1));
  #else
  return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(a.v), _mm256_castsi256_pd(_mm256_set1_epi32(-1))));
  #endif
}
template <> inline VecData<int16_t,16> not_intrin<VecData<int16_t,16>>(const VecData<int16_t,16>& a) {
  #ifdef __AVX2__
  return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1));
  #else
  return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(a.v), _mm256_castsi256_pd(_mm256_set1_epi32(-1))));
  #endif
}
template <> inline VecData<int32_t,8> not_intrin<VecData<int32_t,8>>(const VecData<int32_t,8>& a) {
  #ifdef __AVX2__
  return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1));
  #else
  return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(a.v), _mm256_castsi256_pd(_mm256_set1_epi32(-1))));
  #endif
}
template <> inline VecData<int64_t,4> not_intrin<VecData<int64_t,4>>(const VecData<int64_t,4>& a) {
  #ifdef __AVX2__
  return _mm256_xor_si256(a.v, _mm256_set1_epi32(-1));
  #else
  return _mm256_castpd_si256(_mm256_xor_pd(_mm256_castsi256_pd(a.v), _mm256_castsi256_pd(_mm256_set1_epi32(-1))));
  #endif
}
template <> inline VecData<float,8> not_intrin<VecData<float,8>>(const VecData<float,8>& a) {
  return _mm256_xor_ps(a.v, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
}
template <> inline VecData<double,4> not_intrin<VecData<double,4>>(const VecData<double,4>& a) {
  return _mm256_xor_pd(a.v, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> and_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_and_si256(a.v, b.v);
}
template <> inline VecData<int16_t,16> and_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_and_si256(a.v, b.v);
}
template <> inline VecData<int32_t,8> and_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_and_si256(a.v, b.v);
}
template <> inline VecData<int64_t,4> and_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_and_si256(a.v, b.v);
}
#endif
template <> inline VecData<float,8> and_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_and_ps(a.v, b.v);
}
template <> inline VecData<double,4> and_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_and_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> xor_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_xor_si256(a.v, b.v);
}
template <> inline VecData<int16_t,16> xor_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_xor_si256(a.v, b.v);
}
template <> inline VecData<int32_t,8> xor_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_xor_si256(a.v, b.v);
}
template <> inline VecData<int64_t,4> xor_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_xor_si256(a.v, b.v);
}
#endif
template <> inline VecData<float,8> xor_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_xor_ps(a.v, b.v);
}
template <> inline VecData<double,4> xor_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_xor_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> or_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_or_si256(a.v, b.v);
}
template <> inline VecData<int16_t,16> or_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_or_si256(a.v, b.v);
}
template <> inline VecData<int32_t,8> or_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_or_si256(a.v, b.v);
}
template <> inline VecData<int64_t,4> or_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_or_si256(a.v, b.v);
}
#endif
template <> inline VecData<float,8> or_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_or_ps(a.v, b.v);
}
template <> inline VecData<double,4> or_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_or_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> andnot_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_andnot_si256(b.v, a.v);
}
template <> inline VecData<int16_t,16> andnot_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_andnot_si256(b.v, a.v);
}
template <> inline VecData<int32_t,8> andnot_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_andnot_si256(b.v, a.v);
}
template <> inline VecData<int64_t,4> andnot_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_andnot_si256(b.v, a.v);
}
#endif
template <> inline VecData<float,8> andnot_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_andnot_ps(b.v, a.v);
}
template <> inline VecData<double,4> andnot_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_andnot_pd(b.v, a.v);
}
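// Note: andnot_intrin(a,b) computes a & ~b. The underlying andnot intrinsics
// complement their *first* operand (_mm256_andnot_si256(x,y) == ~x & y),
// which is why the arguments are passed in reverse order above.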
// Bitshift
//template <> inline VecData<int8_t ,32> bitshiftleft_intrin<VecData<int8_t ,32>>(const VecData<int8_t ,32>& a, const Integer& rhs) { }
template <> inline VecData<int16_t,16> bitshiftleft_intrin<VecData<int16_t,16>>(const VecData<int16_t,16>& a, const Integer& rhs) { return _mm256_slli_epi16(a.v, rhs); }
#ifdef __AVX2__
template <> inline VecData<int32_t ,8> bitshiftleft_intrin<VecData<int32_t ,8>>(const VecData<int32_t ,8>& a, const Integer& rhs) { return _mm256_slli_epi32(a.v, rhs); }
template <> inline VecData<int64_t ,4> bitshiftleft_intrin<VecData<int64_t ,4>>(const VecData<int64_t ,4>& a, const Integer& rhs) { return _mm256_slli_epi64(a.v, rhs); }
#endif
template <> inline VecData<float ,8> bitshiftleft_intrin<VecData<float ,8>>(const VecData<float ,8>& a, const Integer& rhs) { return _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_castps_si256(a.v), rhs)); }
template <> inline VecData<double ,4> bitshiftleft_intrin<VecData<double ,4>>(const VecData<double ,4>& a, const Integer& rhs) { return _mm256_castsi256_pd(_mm256_slli_epi64(_mm256_castpd_si256(a.v), rhs)); }
//template <> inline VecData<int8_t ,32> bitshiftright_intrin<VecData<int8_t ,32>>(const VecData<int8_t ,32>& a, const Integer& rhs) { }
template <> inline VecData<int16_t,16> bitshiftright_intrin<VecData<int16_t,16>>(const VecData<int16_t,16>& a, const Integer& rhs) { return _mm256_srli_epi16(a.v, rhs); }
template <> inline VecData<int32_t ,8> bitshiftright_intrin<VecData<int32_t ,8>>(const VecData<int32_t ,8>& a, const Integer& rhs) { return _mm256_srli_epi32(a.v, rhs); }
template <> inline VecData<int64_t ,4> bitshiftright_intrin<VecData<int64_t ,4>>(const VecData<int64_t ,4>& a, const Integer& rhs) { return _mm256_srli_epi64(a.v, rhs); }
template <> inline VecData<float ,8> bitshiftright_intrin<VecData<float ,8>>(const VecData<float ,8>& a, const Integer& rhs) { return _mm256_castsi256_ps(_mm256_srli_epi32(_mm256_castps_si256(a.v), rhs)); }
template <> inline VecData<double ,4> bitshiftright_intrin<VecData<double ,4>>(const VecData<double ,4>& a, const Integer& rhs) { return _mm256_castsi256_pd(_mm256_srli_epi64(_mm256_castpd_si256(a.v), rhs)); }
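// Note: these shifts are logical (zero-filling); bitshiftright_intrin does
// not sign-extend negative integer lanes, and the float/double overloads
// shift the raw IEEE bit patterns of each lane.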
// Other functions
#ifdef __AVX2__
template <> inline VecData<int8_t,32> max_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_max_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,16> max_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_max_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,8> max_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_max_epi32(a.v, b.v);
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__)
template <> inline VecData<int64_t,4> max_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_max_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,8> max_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_max_ps(a.v, b.v);
}
template <> inline VecData<double,4> max_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_max_pd(a.v, b.v);
}
#ifdef __AVX2__
template <> inline VecData<int8_t,32> min_intrin(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) {
  return _mm256_min_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,16> min_intrin(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) {
  return _mm256_min_epi16(a.v, b.v);
}
template <> inline VecData<int32_t,8> min_intrin(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) {
  return _mm256_min_epi32(a.v, b.v);
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__)
template <> inline VecData<int64_t,4> min_intrin(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) {
  return _mm256_min_epi64(a.v, b.v);
}
#endif
template <> inline VecData<float,8> min_intrin(const VecData<float,8>& a, const VecData<float,8>& b) {
  return _mm256_min_ps(a.v, b.v);
}
template <> inline VecData<double,4> min_intrin(const VecData<double,4>& a, const VecData<double,4>& b) {
  return _mm256_min_pd(a.v, b.v);
}
// Conversion operators
template <> inline VecData<float ,8> convert_int2real_intrin<VecData<float ,8>,VecData<int32_t,8>>(const VecData<int32_t,8>& x) {
  return _mm256_cvtepi32_ps(x.v);
}
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
template <> inline VecData<double,4> convert_int2real_intrin<VecData<double,4>,VecData<int64_t,4>>(const VecData<int64_t,4>& x) {
  return _mm256_cvtepi64_pd(x.v);
}
#endif
template <> inline VecData<int32_t,8> round_real2int_intrin<VecData<int32_t,8>,VecData<float ,8>>(const VecData<float ,8>& x) {
  return _mm256_cvtps_epi32(x.v);
}
#if defined(__AVX512DQ__) && defined(__AVX512VL__)
template <> inline VecData<int64_t,4> round_real2int_intrin<VecData<int64_t,4>,VecData<double,4>>(const VecData<double,4>& x) {
  return _mm256_cvtpd_epi64(x.v);
}
#elif defined(__AVX2__)
template <> inline VecData<int64_t,4> round_real2int_intrin<VecData<int64_t,4>,VecData<double,4>>(const VecData<double,4>& x) {
  return _mm256_cvtepi32_epi64(_mm256_cvtpd_epi32(x.v));
}
#endif
template <> inline VecData<float ,8> round_real2real_intrin<VecData<float ,8>>(const VecData<float ,8>& x) { return _mm256_round_ps(x.v, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); }
template <> inline VecData<double,4> round_real2real_intrin<VecData<double,4>>(const VecData<double,4>& x) { return _mm256_round_pd(x.v, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); }
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////

// Comparison operators
#ifdef __AVX2__
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::lt>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return Mask<VecData<int8_t,32>>(_mm256_cmpgt_epi8(b.v,a.v)); }
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::le>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::gt>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return Mask<VecData<int8_t,32>>(_mm256_cmpgt_epi8(a.v,b.v)); }
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::ge>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::eq>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return Mask<VecData<int8_t,32>>(_mm256_cmpeq_epi8(a.v,b.v)); }
template <> inline Mask<VecData<int8_t,32>> comp_intrin<ComparisonType::ne>(const VecData<int8_t,32>& a, const VecData<int8_t,32>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::lt>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return Mask<VecData<int16_t,16>>(_mm256_cmpgt_epi16(b.v,a.v)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::le>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::gt>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return Mask<VecData<int16_t,16>>(_mm256_cmpgt_epi16(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::ge>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::eq>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return Mask<VecData<int16_t,16>>(_mm256_cmpeq_epi16(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,16>> comp_intrin<ComparisonType::ne>(const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::lt>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return Mask<VecData<int32_t,8>>(_mm256_cmpgt_epi32(b.v,a.v)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::le>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::gt>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return Mask<VecData<int32_t,8>>(_mm256_cmpgt_epi32(a.v,b.v)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::ge>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::eq>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return Mask<VecData<int32_t,8>>(_mm256_cmpeq_epi32(a.v,b.v)); }
template <> inline Mask<VecData<int32_t,8>> comp_intrin<ComparisonType::ne>(const VecData<int32_t,8>& a, const VecData<int32_t,8>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::lt>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return Mask<VecData<int64_t,4>>(_mm256_cmpgt_epi64(b.v,a.v)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::le>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return ~(comp_intrin<ComparisonType::lt>(b,a)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::gt>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return Mask<VecData<int64_t,4>>(_mm256_cmpgt_epi64(a.v,b.v)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::ge>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return ~(comp_intrin<ComparisonType::gt>(b,a)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::eq>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return Mask<VecData<int64_t,4>>(_mm256_cmpeq_epi64(a.v,b.v)); }
template <> inline Mask<VecData<int64_t,4>> comp_intrin<ComparisonType::ne>(const VecData<int64_t,4>& a, const VecData<int64_t,4>& b) { return ~(comp_intrin<ComparisonType::eq>(a,b)); }
#endif
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::lt>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_LT_OS)); }
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::le>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_LE_OS)); }
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::gt>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_GT_OS)); }
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::ge>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_GE_OS)); }
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::eq>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_EQ_OS)); }
template <> inline Mask<VecData<float,8>> comp_intrin<ComparisonType::ne>(const VecData<float,8>& a, const VecData<float,8>& b) { return Mask<VecData<float,8>>(_mm256_cmp_ps(a.v, b.v, _CMP_NEQ_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::lt>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_LT_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::le>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_LE_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::gt>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_GT_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::ge>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_GE_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::eq>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_EQ_OS)); }
template <> inline Mask<VecData<double,4>> comp_intrin<ComparisonType::ne>(const VecData<double,4>& a, const VecData<double,4>& b) { return Mask<VecData<double,4>>(_mm256_cmp_pd(a.v, b.v, _CMP_NEQ_OS)); }
#if defined(__AVX2__)
template <> inline VecData<int8_t ,32> select_intrin(const Mask<VecData<int8_t ,32>>& s, const VecData<int8_t ,32>& a, const VecData<int8_t ,32>& b) { return _mm256_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int16_t,16> select_intrin(const Mask<VecData<int16_t,16>>& s, const VecData<int16_t,16>& a, const VecData<int16_t,16>& b) { return _mm256_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int32_t ,8> select_intrin(const Mask<VecData<int32_t ,8>>& s, const VecData<int32_t ,8>& a, const VecData<int32_t ,8>& b) { return _mm256_blendv_epi8(b.v, a.v, s.v); }
template <> inline VecData<int64_t ,4> select_intrin(const Mask<VecData<int64_t ,4>>& s, const VecData<int64_t ,4>& a, const VecData<int64_t ,4>& b) { return _mm256_blendv_epi8(b.v, a.v, s.v); }
#endif
template <> inline VecData<float ,8> select_intrin(const Mask<VecData<float ,8>>& s, const VecData<float ,8>& a, const VecData<float ,8>& b) { return _mm256_blendv_ps(b.v, a.v, s.v); }
template <> inline VecData<double ,4> select_intrin(const Mask<VecData<double ,4>>& s, const VecData<double ,4>& a, const VecData<double ,4>& b) { return _mm256_blendv_pd(b.v, a.v, s.v); }
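// Note: _mm256_blendv_* selects on the high (sign) bit of each lane of s, so
// select_intrin(s,a,b) returns a where the mask is set and b elsewhere. This
// relies on mask lanes being all-ones or all-zeros, as comp_intrin produces.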
// Special functions
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<float,8>> {
  static inline VecData<float,8> eval(const VecData<float,8>& a) {
    #if defined(__AVX512F__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,8>>::eval(_mm256_maskz_rsqrt14_ps(~__mmask8(0), a.v), a.v);
    #else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,8>>::eval(_mm256_rsqrt_ps(a.v), a.v);
    #endif
  }
  static inline VecData<float,8> eval(const VecData<float,8>& a, const Mask<VecData<float,8>>& m) {
    #if defined(__AVX512DQ__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,8>>::eval(_mm256_maskz_rsqrt14_ps(_mm256_movepi32_mask(_mm256_castps_si256(m.v)), a.v), a.v);
    #else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,8>>::eval(and_intrin(VecData<float,8>(_mm256_rsqrt_ps(a.v)), convert_mask2vec_intrin(m)), a.v);
    #endif
  }
};
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<double,4>> {
  static inline VecData<double,4> eval(const VecData<double,4>& a) {
    #if defined(__AVX512F__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,4>>::eval(_mm256_maskz_rsqrt14_pd(~__mmask8(0), a.v), a.v);
    #else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,4>>::eval(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(a.v))), a.v);
    #endif
  }
  static inline VecData<double,4> eval(const VecData<double,4>& a, const Mask<VecData<double,4>>& m) {
    #if defined(__AVX512DQ__) && defined(__AVX512VL__)
    constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,4>>::eval(_mm256_maskz_rsqrt14_pd(_mm256_movepi64_mask(_mm256_castpd_si256(m.v)), a.v), a.v);
    #else
    constexpr Integer newton_iter = mylog2((Integer)(digits/3.4362686889));
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,4>>::eval(and_intrin(VecData<double,4>(_mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(a.v)))), convert_mask2vec_intrin(m)), a.v);
    #endif
  }
};
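// How the newton_iter counts are chosen (a sketch of the reasoning): the
// initial guess from _mm256_rsqrt_ps is accurate to about 11.4 bits (~3.44
// decimal digits) and _mm256_maskz_rsqrt14_ps to about 14 bits (~4.21
// digits). Each Newton step
//   x <- x*(3 - a*x*x)/2
// roughly doubles the number of correct digits, so the iteration count is
// mylog2(digits / digits_of_initial_guess).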
#ifdef SCTL_HAVE_SVML
template <> inline void sincos_intrin<VecData<float ,8>>(VecData<float ,8>& sinx, VecData<float ,8>& cosx, const VecData<float ,8>& x) { sinx = _mm256_sincos_ps(&cosx.v, x.v); }
template <> inline void sincos_intrin<VecData<double,4>>(VecData<double,4>& sinx, VecData<double,4>& cosx, const VecData<double,4>& x) { sinx = _mm256_sincos_pd(&cosx.v, x.v); }
template <> inline VecData<float ,8> exp_intrin<VecData<float ,8>>(const VecData<float ,8>& x) { return _mm256_exp_ps(x.v); }
template <> inline VecData<double,4> exp_intrin<VecData<double,4>>(const VecData<double,4>& x) { return _mm256_exp_pd(x.v); }
#else
template <> inline void sincos_intrin<VecData<float ,8>>(VecData<float ,8>& sinx, VecData<float ,8>& cosx, const VecData<float ,8>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<float>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline void sincos_intrin<VecData<double,4>>(VecData<double,4>& sinx, VecData<double,4>& cosx, const VecData<double,4>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<double>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline VecData<float ,8> exp_intrin<VecData<float ,8>>(const VecData<float ,8>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<float>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
template <> inline VecData<double,4> exp_intrin<VecData<double,4>>(const VecData<double,4>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<double>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
#endif
#endif
}

namespace SCTL_NAMESPACE { // AVX512
#if defined(__AVX512F__)
template <> struct alignas(sizeof(int8_t) * 64) VecData<int8_t,64> {
  using ScalarType = int8_t;
  static constexpr Integer Size = 64;
  VecData() = default;
  inline VecData(__m512i v_) : v(v_) {}
  __m512i v;
};
template <> struct alignas(sizeof(int16_t) * 32) VecData<int16_t,32> {
  using ScalarType = int16_t;
  static constexpr Integer Size = 32;
  VecData() = default;
  inline VecData(__m512i v_) : v(v_) {}
  __m512i v;
};
template <> struct alignas(sizeof(int32_t) * 16) VecData<int32_t,16> {
  using ScalarType = int32_t;
  static constexpr Integer Size = 16;
  VecData() = default;
  inline VecData(__m512i v_) : v(v_) {}
  __m512i v;
};
template <> struct alignas(sizeof(int64_t) * 8) VecData<int64_t,8> {
  using ScalarType = int64_t;
  static constexpr Integer Size = 8;
  VecData() = default;
  inline VecData(__m512i v_) : v(v_) {}
  __m512i v;
};
template <> struct alignas(sizeof(float) * 16) VecData<float,16> {
  using ScalarType = float;
  static constexpr Integer Size = 16;
  VecData() = default;
  inline VecData(__m512 v_) : v(v_) {}
  __m512 v;
};
template <> struct alignas(sizeof(double) * 8) VecData<double,8> {
  using ScalarType = double;
  static constexpr Integer Size = 8;
  VecData() = default;
  inline VecData(__m512d v_) : v(v_) {}
  __m512d v;
};
template <> inline VecData<int8_t,64> zero_intrin<VecData<int8_t,64>>() {
  return _mm512_setzero_si512();
}
template <> inline VecData<int16_t,32> zero_intrin<VecData<int16_t,32>>() {
  return _mm512_setzero_si512();
}
template <> inline VecData<int32_t,16> zero_intrin<VecData<int32_t,16>>() {
  return _mm512_setzero_si512();
}
template <> inline VecData<int64_t,8> zero_intrin<VecData<int64_t,8>>() {
  return _mm512_setzero_si512();
}
template <> inline VecData<float,16> zero_intrin<VecData<float,16>>() {
  return _mm512_setzero_ps();
}
template <> inline VecData<double,8> zero_intrin<VecData<double,8>>() {
  return _mm512_setzero_pd();
}
template <> inline VecData<int8_t,64> set1_intrin<VecData<int8_t,64>>(int8_t a) {
  return _mm512_set1_epi8(a);
}
template <> inline VecData<int16_t,32> set1_intrin<VecData<int16_t,32>>(int16_t a) {
  return _mm512_set1_epi16(a);
}
template <> inline VecData<int32_t,16> set1_intrin<VecData<int32_t,16>>(int32_t a) {
  return _mm512_set1_epi32(a);
}
template <> inline VecData<int64_t,8> set1_intrin<VecData<int64_t,8>>(int64_t a) {
  return _mm512_set1_epi64(a);
}
template <> inline VecData<float,16> set1_intrin<VecData<float,16>>(float a) {
  return _mm512_set1_ps(a);
}
template <> inline VecData<double,8> set1_intrin<VecData<double,8>>(double a) {
  return _mm512_set1_pd(a);
}
template <> inline VecData<int8_t,64> set_intrin<VecData<int8_t,64>,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t,int8_t>(int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17, int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23, int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, int8_t v30, int8_t v31, int8_t v32, int8_t v33, int8_t v34, int8_t v35, int8_t v36, int8_t v37, int8_t v38, int8_t v39, int8_t v40, int8_t v41, int8_t v42, int8_t v43, int8_t v44, int8_t v45, int8_t v46, int8_t v47, int8_t v48, int8_t v49, int8_t v50, int8_t v51, int8_t v52, int8_t v53, int8_t v54, int8_t v55, int8_t v56, int8_t v57, int8_t v58, int8_t v59, int8_t v60, int8_t v61, int8_t v62, int8_t v63, int8_t v64) {
  return _mm512_set_epi8(v64,v63,v62,v61,v60,v59,v58,v57,v56,v55,v54,v53,v52,v51,v50,v49,v48,v47,v46,v45,v44,v43,v42,v41,v40,v39,v38,v37,v36,v35,v34,v33,v32,v31,v30,v29,v28,v27,v26,v25,v24,v23,v22,v21,v20,v19,v18,v17,v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int16_t,32> set_intrin<VecData<int16_t,32>,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t,int16_t>(int16_t v1, int16_t v2, int16_t v3, int16_t v4, int16_t v5, int16_t v6, int16_t v7, int16_t v8, int16_t v9, int16_t v10, int16_t v11, int16_t v12, int16_t v13, int16_t v14, int16_t v15, int16_t v16, int16_t v17, int16_t v18, int16_t v19, int16_t v20, int16_t v21, int16_t v22, int16_t v23, int16_t v24, int16_t v25, int16_t v26, int16_t v27, int16_t v28, int16_t v29, int16_t v30, int16_t v31, int16_t v32) {
  return _mm512_set_epi16(v32,v31,v30,v29,v28,v27,v26,v25,v24,v23,v22,v21,v20,v19,v18,v17,v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int32_t,16> set_intrin<VecData<int32_t,16>,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t>(int32_t v1, int32_t v2, int32_t v3, int32_t v4, int32_t v5, int32_t v6, int32_t v7, int32_t v8, int32_t v9, int32_t v10, int32_t v11, int32_t v12, int32_t v13, int32_t v14, int32_t v15, int32_t v16) {
  return _mm512_set_epi32(v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int64_t,8> set_intrin<VecData<int64_t,8>,int64_t,int64_t,int64_t,int64_t,int64_t,int64_t,int64_t,int64_t>(int64_t v1, int64_t v2, int64_t v3, int64_t v4, int64_t v5, int64_t v6, int64_t v7, int64_t v8) {
  return _mm512_set_epi64(v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<float,16> set_intrin<VecData<float,16>,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float,float>(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15, float v16) {
  return _mm512_set_ps(v16,v15,v14,v13,v12,v11,v10,v9,v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<double,8> set_intrin<VecData<double,8>,double,double,double,double,double,double,double,double>(double v1, double v2, double v3, double v4, double v5, double v6, double v7, double v8) {
  return _mm512_set_pd(v8,v7,v6,v5,v4,v3,v2,v1);
}
template <> inline VecData<int8_t,64> load1_intrin<VecData<int8_t,64>>(int8_t const* p) {
  return _mm512_set1_epi8(p[0]);
}
template <> inline VecData<int16_t,32> load1_intrin<VecData<int16_t,32>>(int16_t const* p) {
  return _mm512_set1_epi16(p[0]);
}
template <> inline VecData<int32_t,16> load1_intrin<VecData<int32_t,16>>(int32_t const* p) {
  return _mm512_set1_epi32(p[0]);
}
template <> inline VecData<int64_t,8> load1_intrin<VecData<int64_t,8>>(int64_t const* p) {
  return _mm512_set1_epi64(p[0]);
}
template <> inline VecData<float,16> load1_intrin<VecData<float,16>>(float const* p) {
  return _mm512_set1_ps(p[0]);
}
template <> inline VecData<double,8> load1_intrin<VecData<double,8>>(double const* p) {
  return _mm512_set1_pd(p[0]);
}
template <> inline VecData<int8_t,64> loadu_intrin<VecData<int8_t,64>>(int8_t const* p) {
  return _mm512_loadu_si512((__m512i const*)p);
}
template <> inline VecData<int16_t,32> loadu_intrin<VecData<int16_t,32>>(int16_t const* p) {
  return _mm512_loadu_si512((__m512i const*)p);
}
template <> inline VecData<int32_t,16> loadu_intrin<VecData<int32_t,16>>(int32_t const* p) {
  return _mm512_loadu_si512((__m512i const*)p);
}
template <> inline VecData<int64_t,8> loadu_intrin<VecData<int64_t,8>>(int64_t const* p) {
  return _mm512_loadu_si512((__m512i const*)p);
}
template <> inline VecData<float,16> loadu_intrin<VecData<float,16>>(float const* p) {
  return _mm512_loadu_ps(p);
}
template <> inline VecData<double,8> loadu_intrin<VecData<double,8>>(double const* p) {
  return _mm512_loadu_pd(p);
}
template <> inline VecData<int8_t,64> load_intrin<VecData<int8_t,64>>(int8_t const* p) {
  return _mm512_load_si512((__m512i const*)p);
}
template <> inline VecData<int16_t,32> load_intrin<VecData<int16_t,32>>(int16_t const* p) {
  return _mm512_load_si512((__m512i const*)p);
}
template <> inline VecData<int32_t,16> load_intrin<VecData<int32_t,16>>(int32_t const* p) {
  return _mm512_load_si512((__m512i const*)p);
}
template <> inline VecData<int64_t,8> load_intrin<VecData<int64_t,8>>(int64_t const* p) {
  return _mm512_load_si512((__m512i const*)p);
}
template <> inline VecData<float,16> load_intrin<VecData<float,16>>(float const* p) {
  return _mm512_load_ps(p);
}
template <> inline VecData<double,8> load_intrin<VecData<double,8>>(double const* p) {
  return _mm512_load_pd(p);
}
template <> inline void storeu_intrin<VecData<int8_t,64>>(int8_t* p, VecData<int8_t,64> vec) {
  _mm512_storeu_si512((__m512i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int16_t,32>>(int16_t* p, VecData<int16_t,32> vec) {
  _mm512_storeu_si512((__m512i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int32_t,16>>(int32_t* p, VecData<int32_t,16> vec) {
  _mm512_storeu_si512((__m512i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<int64_t,8>>(int64_t* p, VecData<int64_t,8> vec) {
  _mm512_storeu_si512((__m512i*)p, vec.v);
}
template <> inline void storeu_intrin<VecData<float,16>>(float* p, VecData<float,16> vec) {
  _mm512_storeu_ps(p, vec.v);
}
template <> inline void storeu_intrin<VecData<double,8>>(double* p, VecData<double,8> vec) {
  _mm512_storeu_pd(p, vec.v);
}
template <> inline void store_intrin<VecData<int8_t,64>>(int8_t* p, VecData<int8_t,64> vec) {
  _mm512_store_si512((__m512i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int16_t,32>>(int16_t* p, VecData<int16_t,32> vec) {
  _mm512_store_si512((__m512i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int32_t,16>>(int32_t* p, VecData<int32_t,16> vec) {
  _mm512_store_si512((__m512i*)p, vec.v);
}
template <> inline void store_intrin<VecData<int64_t,8>>(int64_t* p, VecData<int64_t,8> vec) {
  _mm512_store_si512((__m512i*)p, vec.v);
}
template <> inline void store_intrin<VecData<float,16>>(float* p, VecData<float,16> vec) {
  _mm512_store_ps(p, vec.v);
}
template <> inline void store_intrin<VecData<double,8>>(double* p, VecData<double,8> vec) {
  _mm512_store_pd(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int8_t,64>>(int8_t* p, VecData<int8_t,64> vec) {
  _mm512_stream_si512((__m512i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int16_t,32>>(int16_t* p, VecData<int16_t,32> vec) {
  _mm512_stream_si512((__m512i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int32_t,16>>(int32_t* p, VecData<int32_t,16> vec) {
  _mm512_stream_si512((__m512i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<int64_t,8>>(int64_t* p, VecData<int64_t,8> vec) {
  _mm512_stream_si512((__m512i*)p, vec.v);
}
template <> inline void stream_store_intrin<VecData<float,16>>(float* p, VecData<float,16> vec) {
  _mm512_stream_ps(p, vec.v);
}
template <> inline void stream_store_intrin<VecData<double,8>>(double* p, VecData<double,8> vec) {
  _mm512_stream_pd(p, vec.v);
}
//template <> inline int8_t extract_intrin<VecData<int8_t,64>>(VecData<int8_t,64> vec, Integer i) {}
//template <> inline int16_t extract_intrin<VecData<int16_t,32>>(VecData<int16_t,32> vec, Integer i) {}
//template <> inline int32_t extract_intrin<VecData<int32_t,16>>(VecData<int32_t,16> vec, Integer i) {}
//template <> inline int64_t extract_intrin<VecData<int64_t,8>>(VecData<int64_t,8> vec, Integer i) {}
//template <> inline float extract_intrin<VecData<float,16>>(VecData<float,16> vec, Integer i) {}
//template <> inline double extract_intrin<VecData<double,8>>(VecData<double,8> vec, Integer i) {}
//template <> inline void insert_intrin<VecData<int8_t,64>>(VecData<int8_t,64>& vec, Integer i, int8_t value) {}
//template <> inline void insert_intrin<VecData<int16_t,32>>(VecData<int16_t,32>& vec, Integer i, int16_t value) {}
//template <> inline void insert_intrin<VecData<int32_t,16>>(VecData<int32_t,16>& vec, Integer i, int32_t value) {}
//template <> inline void insert_intrin<VecData<int64_t,8>>(VecData<int64_t,8>& vec, Integer i, int64_t value) {}
//template <> inline void insert_intrin<VecData<float,16>>(VecData<float,16>& vec, Integer i, float value) {}
//template <> inline void insert_intrin<VecData<double,8>>(VecData<double,8>& vec, Integer i, double value) {}
// Arithmetic operators
//template <> inline VecData<int8_t,64> unary_minus_intrin<VecData<int8_t,64>>(const VecData<int8_t,64>& a) {}
//template <> inline VecData<int16_t,32> unary_minus_intrin<VecData<int16_t,32>>(const VecData<int16_t,32>& a) {}
template <> inline VecData<int32_t,16> unary_minus_intrin<VecData<int32_t,16>>(const VecData<int32_t,16>& a) {
  return _mm512_sub_epi32(_mm512_setzero_epi32(), a.v);
}
template <> inline VecData<int64_t,8> unary_minus_intrin<VecData<int64_t,8>>(const VecData<int64_t,8>& a) {
  return _mm512_sub_epi64(_mm512_setzero_epi32(), a.v);
}
template <> inline VecData<float,16> unary_minus_intrin<VecData<float,16>>(const VecData<float,16>& a) {
  #ifdef __AVX512DQ__
  return _mm512_xor_ps(a.v, _mm512_castsi512_ps(_mm512_set1_epi32(0x80000000)));
  #else
  return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(a.v), _mm512_set1_epi32(0x80000000)));
  #endif
}
template <> inline VecData<double,8> unary_minus_intrin<VecData<double,8>>(const VecData<double,8>& a) {
  #ifdef __AVX512DQ__
  return _mm512_xor_pd(a.v, _mm512_castsi512_pd(_mm512_set1_epi64(0x8000000000000000)));
  #else
  return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(a.v), _mm512_set1_epi64(0x8000000000000000)));
  #endif
}
//template <> inline VecData<int8_t,64> mul_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {}
//template <> inline VecData<int16_t,32> mul_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {}
template <> inline VecData<int32_t,16> mul_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_mullo_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> mul_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  #if defined(__AVX512DQ__)
  return _mm512_mullo_epi64(a.v, b.v);
  #elif defined (__INTEL_COMPILER)
  return _mm512_mullox_epi64(a.v, b.v); // _mm512_mullox_epi64 missing in gcc
  #else
  // The 64-bit multiply instruction does not exist without AVX512DQ. Split into 32-bit multiplies.
  //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1);       // swap H<->L
  __m512i ahigh = _mm512_srli_epi64(a.v, 32);          // high 32 bits of each a
  __m512i bhigh = _mm512_srli_epi64(b.v, 32);          // high 32 bits of each b
  __m512i prodahb = _mm512_mul_epu32(ahigh, b.v);      // ahigh*b
  __m512i prodbha = _mm512_mul_epu32(bhigh, a.v);      // bhigh*a
  __m512i prodhl = _mm512_add_epi64(prodahb, prodbha); // sum of high*low products
  __m512i prodhi = _mm512_slli_epi64(prodhl, 32);      // same, shifted high
  __m512i prodll = _mm512_mul_epu32(a.v, b.v);         // alow*blow = 64 bit unsigned products
  __m512i prod = _mm512_add_epi64(prodll, prodhi);     // low*low+(high*low)<<32
  return prod;
  #endif
}
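// Same decomposition as the AVX2 int64 multiply above: with a = aH*2^32 + aL
// and b = bH*2^32 + bL, a*b mod 2^64 = aL*bL + ((aH*bL + aL*bH) << 32).
// _mm512_mul_epu32 reads the low 32 bits of each 64-bit lane, so shifting the
// high halves down by 32 produces the two cross products directly.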
template <> inline VecData<float,16> mul_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_mul_ps(a.v, b.v);
}
template <> inline VecData<double,8> mul_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_mul_pd(a.v, b.v);
}
template <> inline VecData<float,16> div_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_div_ps(a.v, b.v);
}
template <> inline VecData<double,8> div_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_div_pd(a.v, b.v);
}
//template <> inline VecData<int8_t,64> add_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {}
//template <> inline VecData<int16_t,32> add_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {}
template <> inline VecData<int32_t,16> add_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_add_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> add_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_add_epi64(a.v, b.v);
}
template <> inline VecData<float,16> add_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_add_ps(a.v, b.v);
}
template <> inline VecData<double,8> add_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_add_pd(a.v, b.v);
}
//template <> inline VecData<int8_t,64> sub_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {}
//template <> inline VecData<int16_t,32> sub_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {}
template <> inline VecData<int32_t,16> sub_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_sub_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> sub_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_sub_epi64(a.v, b.v);
}
template <> inline VecData<float,16> sub_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_sub_ps(a.v, b.v);
}
template <> inline VecData<double,8> sub_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_sub_pd(a.v, b.v);
}
//template <> inline VecData<int8_t,64> fma_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b, const VecData<int8_t,64>& c) {}
//template <> inline VecData<int16_t,32> fma_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b, const VecData<int16_t,32>& c) {}
//template <> inline VecData<int32_t,16> fma_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b, const VecData<int32_t,16>& c) {}
//template <> inline VecData<int64_t,8> fma_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b, const VecData<int64_t,8>& c) {}
template <> inline VecData<float,16> fma_intrin(const VecData<float,16>& a, const VecData<float,16>& b, const VecData<float,16>& c) {
  return _mm512_fmadd_ps(a.v, b.v, c.v);
}
template <> inline VecData<double,8> fma_intrin(const VecData<double,8>& a, const VecData<double,8>& b, const VecData<double,8>& c) {
  return _mm512_fmadd_pd(a.v, b.v, c.v);
}
// Bitwise operators
template <> inline VecData<int8_t,64> not_intrin<VecData<int8_t,64>>(const VecData<int8_t,64>& a) {
  return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1));
}
template <> inline VecData<int16_t,32> not_intrin<VecData<int16_t,32>>(const VecData<int16_t,32>& a) {
  return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1));
}
template <> inline VecData<int32_t,16> not_intrin<VecData<int32_t,16>>(const VecData<int32_t,16>& a) {
  return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1));
}
template <> inline VecData<int64_t,8> not_intrin<VecData<int64_t,8>>(const VecData<int64_t,8>& a) {
  return _mm512_xor_si512(a.v, _mm512_set1_epi32(-1));
}
template <> inline VecData<float,16> not_intrin<VecData<float,16>>(const VecData<float,16>& a) {
  #ifdef __AVX512DQ__
  return _mm512_xor_ps(a.v, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
  #else
  return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(a.v), _mm512_set1_epi32(-1)));
  #endif
}
template <> inline VecData<double,8> not_intrin<VecData<double,8>>(const VecData<double,8>& a) {
  #ifdef __AVX512DQ__
  return _mm512_xor_pd(a.v, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
  #else
  return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(a.v), _mm512_set1_epi32(-1)));
  #endif
}
template <> inline VecData<int8_t,64> and_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_and_epi32(a.v, b.v);
}
template <> inline VecData<int16_t,32> and_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_and_epi32(a.v, b.v);
}
template <> inline VecData<int32_t,16> and_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_and_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> and_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_and_epi32(a.v, b.v);
}
template <> inline VecData<float,16> and_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  #ifdef __AVX512DQ__
  return _mm512_and_ps(a.v, b.v);
  #else
  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v)));
  #endif
}
template <> inline VecData<double,8> and_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  #ifdef __AVX512DQ__
  return _mm512_and_pd(a.v, b.v);
  #else
  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a.v), _mm512_castpd_si512(b.v)));
  #endif
}
template <> inline VecData<int8_t,64> xor_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_xor_epi32(a.v, b.v);
}
template <> inline VecData<int16_t,32> xor_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_xor_epi32(a.v, b.v);
}
template <> inline VecData<int32_t,16> xor_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_xor_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> xor_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_xor_epi32(a.v, b.v);
}
template <> inline VecData<float,16> xor_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  #ifdef __AVX512DQ__
  return _mm512_xor_ps(a.v, b.v);
  #else
  return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v)));
  #endif
}
template <> inline VecData<double,8> xor_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  #ifdef __AVX512DQ__
  return _mm512_xor_pd(a.v, b.v);
  #else
  return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(a.v), _mm512_castpd_si512(b.v)));
  #endif
}
template <> inline VecData<int8_t,64> or_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_or_epi32(a.v, b.v);
}
template <> inline VecData<int16_t,32> or_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_or_epi32(a.v, b.v);
}
template <> inline VecData<int32_t,16> or_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_or_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> or_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_or_epi32(a.v, b.v);
}
template <> inline VecData<float,16> or_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  #ifdef __AVX512DQ__
  return _mm512_or_ps(a.v, b.v);
  #else
  return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(a.v), _mm512_castps_si512(b.v)));
  #endif
}
template <> inline VecData<double,8> or_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  #ifdef __AVX512DQ__
  return _mm512_or_pd(a.v, b.v);
  #else
  return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(a.v), _mm512_castpd_si512(b.v)));
  #endif
}
template <> inline VecData<int8_t,64> andnot_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_andnot_epi32(b.v, a.v);
}
template <> inline VecData<int16_t,32> andnot_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_andnot_epi32(b.v, a.v);
}
template <> inline VecData<int32_t,16> andnot_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_andnot_epi32(b.v, a.v);
}
template <> inline VecData<int64_t,8> andnot_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_andnot_epi32(b.v, a.v);
}
template <> inline VecData<float,16> andnot_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  #ifdef __AVX512DQ__
  return _mm512_andnot_ps(b.v, a.v);
  #else
  return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(b.v), _mm512_castps_si512(a.v)));
  #endif
}
template <> inline VecData<double,8> andnot_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  #ifdef __AVX512DQ__
  return _mm512_andnot_pd(b.v, a.v);
  #else
  return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(b.v), _mm512_castpd_si512(a.v)));
  #endif
}
// Bitshift
//template <> inline VecData<int8_t ,64> bitshiftleft_intrin<VecData<int8_t ,64>>(const VecData<int8_t ,64>& a, const Integer& rhs) { } // no 8-bit shift intrinsics in AVX-512
template <> inline VecData<int16_t,32> bitshiftleft_intrin<VecData<int16_t,32>>(const VecData<int16_t,32>& a, const Integer& rhs) { return _mm512_slli_epi16(a.v , rhs); }
template <> inline VecData<int32_t,16> bitshiftleft_intrin<VecData<int32_t,16>>(const VecData<int32_t,16>& a, const Integer& rhs) { return _mm512_slli_epi32(a.v , rhs); }
template <> inline VecData<int64_t ,8> bitshiftleft_intrin<VecData<int64_t ,8>>(const VecData<int64_t ,8>& a, const Integer& rhs) { return _mm512_slli_epi64(a.v , rhs); }
template <> inline VecData<float ,16> bitshiftleft_intrin<VecData<float ,16>>(const VecData<float ,16>& a, const Integer& rhs) { return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_castps_si512(a.v), rhs)); }
template <> inline VecData<double ,8> bitshiftleft_intrin<VecData<double ,8>>(const VecData<double ,8>& a, const Integer& rhs) { return _mm512_castsi512_pd(_mm512_slli_epi64(_mm512_castpd_si512(a.v), rhs)); }
//template <> inline VecData<int8_t ,64> bitshiftright_intrin<VecData<int8_t ,64>>(const VecData<int8_t ,64>& a, const Integer& rhs) { } // no 8-bit shift intrinsics in AVX-512
template <> inline VecData<int16_t,32> bitshiftright_intrin<VecData<int16_t,32>>(const VecData<int16_t,32>& a, const Integer& rhs) { return _mm512_srli_epi16(a.v , rhs); }
template <> inline VecData<int32_t,16> bitshiftright_intrin<VecData<int32_t,16>>(const VecData<int32_t,16>& a, const Integer& rhs) { return _mm512_srli_epi32(a.v , rhs); }
template <> inline VecData<int64_t ,8> bitshiftright_intrin<VecData<int64_t ,8>>(const VecData<int64_t ,8>& a, const Integer& rhs) { return _mm512_srli_epi64(a.v , rhs); }
template <> inline VecData<float ,16> bitshiftright_intrin<VecData<float ,16>>(const VecData<float ,16>& a, const Integer& rhs) { return _mm512_castsi512_ps(_mm512_srli_epi32(_mm512_castps_si512(a.v), rhs)); }
template <> inline VecData<double ,8> bitshiftright_intrin<VecData<double ,8>>(const VecData<double ,8>& a, const Integer& rhs) { return _mm512_castsi512_pd(_mm512_srli_epi64(_mm512_castpd_si512(a.v), rhs)); }
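// Notes on the shifts above: the 16-bit variants (_mm512_slli_epi16 /
// _mm512_srli_epi16) require AVX-512BW, while the 32- and 64-bit variants need
// only AVX-512F. The float/double overloads shift the raw bit pattern of each
// lane, which is mainly useful for exponent/mantissa manipulation; e.g.
// (sketch) bitshiftright_intrin(x, 52) moves each double lane's sign and
// exponent bits down into the low 12 bits of the lane.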
// Max and min
#if defined(__AVX512BW__)
template <> inline VecData<int8_t,64> max_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_max_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,32> max_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_max_epi16(a.v, b.v);
}
#endif
template <> inline VecData<int32_t,16> max_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_max_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> max_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_max_epi64(a.v, b.v);
}
template <> inline VecData<float,16> max_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_max_ps(a.v, b.v);
}
template <> inline VecData<double,8> max_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_max_pd(a.v, b.v);
}
#if defined(__AVX512BW__)
template <> inline VecData<int8_t,64> min_intrin(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) {
  return _mm512_min_epi8(a.v, b.v);
}
template <> inline VecData<int16_t,32> min_intrin(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) {
  return _mm512_min_epi16(a.v, b.v);
}
#endif
template <> inline VecData<int32_t,16> min_intrin(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) {
  return _mm512_min_epi32(a.v, b.v);
}
template <> inline VecData<int64_t,8> min_intrin(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) {
  return _mm512_min_epi64(a.v, b.v);
}
template <> inline VecData<float,16> min_intrin(const VecData<float,16>& a, const VecData<float,16>& b) {
  return _mm512_min_ps(a.v, b.v);
}
template <> inline VecData<double,8> min_intrin(const VecData<double,8>& a, const VecData<double,8>& b) {
  return _mm512_min_pd(a.v, b.v);
}
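// NaN caveat for the floating-point max/min above: following the MAXPS/MINPS
// convention, _mm512_max_ps/pd and _mm512_min_ps/pd return the second operand
// when either input is NaN, so max_intrin(a, b) and max_intrin(b, a) may give
// different results once NaNs are involved.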
// Conversion operators
#if defined(__AVX512DQ__)
template <> inline VecData<float,16> convert_int2real_intrin<VecData<float,16>,VecData<int32_t,16>>(const VecData<int32_t,16>& x) { return _mm512_cvtepi32_ps(x.v); }
template <> inline VecData<double,8> convert_int2real_intrin<VecData<double,8>,VecData<int64_t, 8>>(const VecData<int64_t, 8>& x) { return _mm512_cvtepi64_pd(x.v); }
template <> inline VecData<int32_t,16> round_real2int_intrin<VecData<int32_t,16>,VecData<float,16>>(const VecData<float,16>& x) { return _mm512_cvtps_epi32(x.v); }
template <> inline VecData<int64_t, 8> round_real2int_intrin<VecData<int64_t, 8>,VecData<double,8>>(const VecData<double,8>& x) { return _mm512_cvtpd_epi64(x.v); }
#endif
template <> inline VecData<float,16> round_real2real_intrin<VecData<float,16>>(const VecData<float,16>& x) { return _mm512_roundscale_ps(x.v, _MM_FROUND_TO_NEAREST_INT); }
template <> inline VecData<double,8> round_real2real_intrin<VecData<double,8>>(const VecData<double,8>& x) { return _mm512_roundscale_pd(x.v, _MM_FROUND_TO_NEAREST_INT); }
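// _MM_FROUND_TO_NEAREST_INT selects round-to-nearest with ties to even
// (banker's rounding): e.g. 2.5 rounds to 2.0 but 3.5 rounds to 4.0. The
// 64-bit conversions above (_mm512_cvtepi64_pd / _mm512_cvtpd_epi64) exist
// only with AVX-512DQ, hence the guard.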
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
#if defined(__AVX512BW__)
template <> struct Mask<VecData<int8_t ,64>> {
  using ScalarType = int8_t;
  static constexpr Integer Size = 64;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask64& v_) : v(v_) {}
  __mmask64 v;
};
template <> struct Mask<VecData<int16_t,32>> {
  using ScalarType = int16_t;
  static constexpr Integer Size = 32;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask32& v_) : v(v_) {}
  __mmask32 v;
};
#endif
#if defined(__AVX512DQ__)
template <> struct Mask<VecData<int32_t,16>> {
  using ScalarType = int32_t;
  static constexpr Integer Size = 16;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask16& v_) : v(v_) {}
  __mmask16 v;
};
template <> struct Mask<VecData<int64_t ,8>> {
  using ScalarType = int64_t;
  static constexpr Integer Size = 8;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask8& v_) : v(v_) {}
  __mmask8 v;
};
template <> struct Mask<VecData<float ,16>> {
  using ScalarType = float;
  static constexpr Integer Size = 16;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask16& v_) : v(v_) {}
  __mmask16 v;
};
template <> struct Mask<VecData<double ,8>> {
  using ScalarType = double;
  static constexpr Integer Size = 8;
  static inline Mask Zero() {
    return Mask(0);
  }
  Mask() = default;
  Mask(const Mask&) = default;
  Mask& operator=(const Mask&) = default;
  ~Mask() = default;
  inline explicit Mask(const __mmask8& v_) : v(v_) {}
  __mmask8 v;
};
#endif
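// Unlike the SSE/AVX backends, where a mask is a full-width vector of
// all-ones/all-zeros lanes, these AVX-512 Mask specializations wrap the
// dedicated k-register types (__mmask8/16/32/64), one bit per lane.
// Usage sketch (comp_intrin and select_intrin are specialized further below):
//   Mask<VecData<float,16>> m = comp_intrin<ComparisonType::lt>(a, b);
//   VecData<float,16> r = select_intrin(m, a, b); // r[i] = (a[i] < b[i] ? a[i] : b[i])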
// Bitwise operators
#if defined(__AVX512BW__)
template <> inline Mask<VecData<int8_t ,64>> operator~<VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& vec) { return Mask<VecData<int8_t ,64>>(_knot_mask64(vec.v)); }
template <> inline Mask<VecData<int16_t,32>> operator~<VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& vec) { return Mask<VecData<int16_t,32>>(_knot_mask32(vec.v)); }
template <> inline Mask<VecData<int8_t ,64>> operator&<VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& a, const Mask<VecData<int8_t ,64>>& b) { return Mask<VecData<int8_t ,64>>(_kand_mask64(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,32>> operator&<VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& a, const Mask<VecData<int16_t,32>>& b) { return Mask<VecData<int16_t,32>>(_kand_mask32(a.v,b.v)); }
template <> inline Mask<VecData<int8_t ,64>> operator^<VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& a, const Mask<VecData<int8_t ,64>>& b) { return Mask<VecData<int8_t ,64>>(_kxor_mask64(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,32>> operator^<VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& a, const Mask<VecData<int16_t,32>>& b) { return Mask<VecData<int16_t,32>>(_kxor_mask32(a.v,b.v)); }
template <> inline Mask<VecData<int8_t ,64>> operator|<VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& a, const Mask<VecData<int8_t ,64>>& b) { return Mask<VecData<int8_t ,64>>(_kor_mask64(a.v,b.v)); }
template <> inline Mask<VecData<int16_t,32>> operator|<VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& a, const Mask<VecData<int16_t,32>>& b) { return Mask<VecData<int16_t,32>>(_kor_mask32(a.v,b.v)); }
template <> inline Mask<VecData<int8_t ,64>> AndNot <VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& a, const Mask<VecData<int8_t ,64>>& b) { return Mask<VecData<int8_t ,64>>(_kandn_mask64(b.v,a.v)); }
template <> inline Mask<VecData<int16_t,32>> AndNot <VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& a, const Mask<VecData<int16_t,32>>& b) { return Mask<VecData<int16_t,32>>(_kandn_mask32(b.v,a.v)); }
template <> inline VecData<int8_t ,64> convert_mask2vec_intrin<VecData<int8_t ,64>>(const Mask<VecData<int8_t ,64>>& a) { return _mm512_movm_epi8 (a.v); }
template <> inline VecData<int16_t,32> convert_mask2vec_intrin<VecData<int16_t,32>>(const Mask<VecData<int16_t,32>>& a) { return _mm512_movm_epi16(a.v); }
template <> inline Mask<VecData<int8_t ,64>> convert_vec2mask_intrin<VecData<int8_t ,64>>(const VecData<int8_t ,64>& a) { return Mask<VecData<int8_t ,64>>(_mm512_movepi8_mask (a.v)); }
template <> inline Mask<VecData<int16_t,32>> convert_vec2mask_intrin<VecData<int16_t,32>>(const VecData<int16_t,32>& a) { return Mask<VecData<int16_t,32>>(_mm512_movepi16_mask(a.v)); }
#endif
#if defined(__AVX512DQ__)
template <> inline Mask<VecData<int32_t,16>> operator~<VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& vec) { return Mask<VecData<int32_t,16>>(_knot_mask16(vec.v)); }
template <> inline Mask<VecData<int64_t ,8>> operator~<VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& vec) { return Mask<VecData<int64_t ,8>>(_knot_mask8 (vec.v)); }
template <> inline Mask<VecData<float ,16>> operator~<VecData<float ,16>>(const Mask<VecData<float ,16>>& vec) { return Mask<VecData<float ,16>>(_knot_mask16(vec.v)); }
template <> inline Mask<VecData<double ,8>> operator~<VecData<double ,8>>(const Mask<VecData<double ,8>>& vec) { return Mask<VecData<double ,8>>(_knot_mask8 (vec.v)); }
template <> inline Mask<VecData<int32_t,16>> operator&<VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& a, const Mask<VecData<int32_t,16>>& b) { return Mask<VecData<int32_t,16>>(_kand_mask16(a.v,b.v)); }
template <> inline Mask<VecData<int64_t ,8>> operator&<VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& a, const Mask<VecData<int64_t ,8>>& b) { return Mask<VecData<int64_t ,8>>(_kand_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<float ,16>> operator&<VecData<float ,16>>(const Mask<VecData<float ,16>>& a, const Mask<VecData<float ,16>>& b) { return Mask<VecData<float ,16>>(_kand_mask16(a.v,b.v)); }
template <> inline Mask<VecData<double ,8>> operator&<VecData<double ,8>>(const Mask<VecData<double ,8>>& a, const Mask<VecData<double ,8>>& b) { return Mask<VecData<double ,8>>(_kand_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<int32_t,16>> operator^<VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& a, const Mask<VecData<int32_t,16>>& b) { return Mask<VecData<int32_t,16>>(_kxor_mask16(a.v,b.v)); }
template <> inline Mask<VecData<int64_t ,8>> operator^<VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& a, const Mask<VecData<int64_t ,8>>& b) { return Mask<VecData<int64_t ,8>>(_kxor_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<float ,16>> operator^<VecData<float ,16>>(const Mask<VecData<float ,16>>& a, const Mask<VecData<float ,16>>& b) { return Mask<VecData<float ,16>>(_kxor_mask16(a.v,b.v)); }
template <> inline Mask<VecData<double ,8>> operator^<VecData<double ,8>>(const Mask<VecData<double ,8>>& a, const Mask<VecData<double ,8>>& b) { return Mask<VecData<double ,8>>(_kxor_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<int32_t,16>> operator|<VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& a, const Mask<VecData<int32_t,16>>& b) { return Mask<VecData<int32_t,16>>(_kor_mask16(a.v,b.v)); }
template <> inline Mask<VecData<int64_t ,8>> operator|<VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& a, const Mask<VecData<int64_t ,8>>& b) { return Mask<VecData<int64_t ,8>>(_kor_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<float ,16>> operator|<VecData<float ,16>>(const Mask<VecData<float ,16>>& a, const Mask<VecData<float ,16>>& b) { return Mask<VecData<float ,16>>(_kor_mask16(a.v,b.v)); }
template <> inline Mask<VecData<double ,8>> operator|<VecData<double ,8>>(const Mask<VecData<double ,8>>& a, const Mask<VecData<double ,8>>& b) { return Mask<VecData<double ,8>>(_kor_mask8 (a.v,b.v)); }
template <> inline Mask<VecData<int32_t,16>> AndNot <VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& a, const Mask<VecData<int32_t,16>>& b) { return Mask<VecData<int32_t,16>>(_kandn_mask16(b.v,a.v)); }
template <> inline Mask<VecData<int64_t ,8>> AndNot <VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& a, const Mask<VecData<int64_t ,8>>& b) { return Mask<VecData<int64_t ,8>>(_kandn_mask8 (b.v,a.v)); }
template <> inline Mask<VecData<float ,16>> AndNot <VecData<float ,16>>(const Mask<VecData<float ,16>>& a, const Mask<VecData<float ,16>>& b) { return Mask<VecData<float ,16>>(_kandn_mask16(b.v,a.v)); }
template <> inline Mask<VecData<double ,8>> AndNot <VecData<double ,8>>(const Mask<VecData<double ,8>>& a, const Mask<VecData<double ,8>>& b) { return Mask<VecData<double ,8>>(_kandn_mask8 (b.v,a.v)); }
template <> inline VecData<int32_t,16> convert_mask2vec_intrin<VecData<int32_t,16>>(const Mask<VecData<int32_t,16>>& a) { return _mm512_movm_epi32(a.v); }
template <> inline VecData<int64_t ,8> convert_mask2vec_intrin<VecData<int64_t ,8>>(const Mask<VecData<int64_t ,8>>& a) { return _mm512_movm_epi64(a.v); }
template <> inline VecData<float ,16> convert_mask2vec_intrin<VecData<float ,16>>(const Mask<VecData<float ,16>>& a) { return _mm512_castsi512_ps(_mm512_movm_epi32(a.v)); }
template <> inline VecData<double ,8> convert_mask2vec_intrin<VecData<double ,8>>(const Mask<VecData<double ,8>>& a) { return _mm512_castsi512_pd(_mm512_movm_epi64(a.v)); }
template <> inline Mask<VecData<int32_t,16>> convert_vec2mask_intrin<VecData<int32_t,16>>(const VecData<int32_t,16>& a) { return Mask<VecData<int32_t,16>>(_mm512_movepi32_mask(a.v)); }
template <> inline Mask<VecData<int64_t ,8>> convert_vec2mask_intrin<VecData<int64_t ,8>>(const VecData<int64_t ,8>& a) { return Mask<VecData<int64_t ,8>>(_mm512_movepi64_mask(a.v)); }
template <> inline Mask<VecData<float ,16>> convert_vec2mask_intrin<VecData<float ,16>>(const VecData<float ,16>& a) { return Mask<VecData<float ,16>>(_mm512_movepi32_mask(_mm512_castps_si512(a.v))); }
template <> inline Mask<VecData<double ,8>> convert_vec2mask_intrin<VecData<double ,8>>(const VecData<double ,8>& a) { return Mask<VecData<double ,8>>(_mm512_movepi64_mask(_mm512_castpd_si512(a.v))); }
#endif
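// convert_mask2vec_intrin expands each mask bit to an all-ones/all-zeros lane
// (_mm512_movm_epi*); convert_vec2mask_intrin compresses each lane's sign bit
// back into a k-register (_mm512_movepi*_mask). The two are inverses on
// vectors whose lanes are entirely ones or zeros, such as comparison results.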
/////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////
// Comparison operators
#if defined(__AVX512BW__)
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::lt>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_LT)); }
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::le>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_LE)); }
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::gt>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_NLE)); }
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::ge>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_NLT)); }
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::eq>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_EQ)); }
template <> inline Mask<VecData<int8_t,64>> comp_intrin<ComparisonType::ne>(const VecData<int8_t,64>& a, const VecData<int8_t,64>& b) { return Mask<VecData<int8_t,64>>(_mm512_cmp_epi8_mask(a.v, b.v, _MM_CMPINT_NE)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::lt>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_LT)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::le>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_LE)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::gt>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_NLE)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::ge>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_NLT)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::eq>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_EQ)); }
template <> inline Mask<VecData<int16_t,32>> comp_intrin<ComparisonType::ne>(const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return Mask<VecData<int16_t,32>>(_mm512_cmp_epi16_mask(a.v, b.v, _MM_CMPINT_NE)); }
#endif
#if defined(__AVX512DQ__)
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::lt>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_LT)); }
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::le>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_LE)); }
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::gt>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_NLE)); }
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::ge>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_NLT)); }
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::eq>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_EQ)); }
template <> inline Mask<VecData<int32_t,16>> comp_intrin<ComparisonType::ne>(const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return Mask<VecData<int32_t,16>>(_mm512_cmp_epi32_mask(a.v, b.v, _MM_CMPINT_NE)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::lt>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_LT)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::le>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_LE)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::gt>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_NLE)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::ge>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_NLT)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::eq>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_EQ)); }
template <> inline Mask<VecData<int64_t,8>> comp_intrin<ComparisonType::ne>(const VecData<int64_t,8>& a, const VecData<int64_t,8>& b) { return Mask<VecData<int64_t,8>>(_mm512_cmp_epi64_mask(a.v, b.v, _MM_CMPINT_NE)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::lt>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_LT_OS)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::le>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_LE_OS)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::gt>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_GT_OS)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::ge>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_GE_OS)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::eq>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_EQ_OS)); }
template <> inline Mask<VecData<float,16>> comp_intrin<ComparisonType::ne>(const VecData<float,16>& a, const VecData<float,16>& b) { return Mask<VecData<float,16>>(_mm512_cmp_ps_mask(a.v, b.v, _CMP_NEQ_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::lt>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_LT_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::le>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_LE_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::gt>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_GT_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::ge>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_GE_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::eq>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_EQ_OS)); }
template <> inline Mask<VecData<double,8>> comp_intrin<ComparisonType::ne>(const VecData<double,8>& a, const VecData<double,8>& b) { return Mask<VecData<double,8>>(_mm512_cmp_pd_mask(a.v, b.v, _CMP_NEQ_OS)); }
#endif
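// The floating-point comparisons above use ordered-signaling predicates
// (_CMP_*_OS); in particular _CMP_NEQ_OS is false whenever an operand is NaN,
// which differs from IEEE-754 '!=' (true on unordered inputs). For the integer
// comparisons, _MM_CMPINT_NLE and _MM_CMPINT_NLT encode 'greater-than' and
// 'greater-or-equal' respectively.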
#if defined(__AVX512BW__)
template <> inline VecData<int8_t ,64> select_intrin(const Mask<VecData<int8_t ,64>>& s, const VecData<int8_t ,64>& a, const VecData<int8_t ,64>& b) { return _mm512_mask_blend_epi8 (s.v, b.v, a.v); }
template <> inline VecData<int16_t,32> select_intrin(const Mask<VecData<int16_t,32>>& s, const VecData<int16_t,32>& a, const VecData<int16_t,32>& b) { return _mm512_mask_blend_epi16(s.v, b.v, a.v); }
#endif
#if defined(__AVX512DQ__)
template <> inline VecData<int32_t,16> select_intrin(const Mask<VecData<int32_t,16>>& s, const VecData<int32_t,16>& a, const VecData<int32_t,16>& b) { return _mm512_mask_blend_epi32(s.v, b.v, a.v); }
template <> inline VecData<int64_t ,8> select_intrin(const Mask<VecData<int64_t ,8>>& s, const VecData<int64_t ,8>& a, const VecData<int64_t ,8>& b) { return _mm512_mask_blend_epi64(s.v, b.v, a.v); }
template <> inline VecData<float ,16> select_intrin(const Mask<VecData<float ,16>>& s, const VecData<float ,16>& a, const VecData<float ,16>& b) { return _mm512_mask_blend_ps (s.v, b.v, a.v); }
template <> inline VecData<double ,8> select_intrin(const Mask<VecData<double ,8>>& s, const VecData<double ,8>& a, const VecData<double ,8>& b) { return _mm512_mask_blend_pd (s.v, b.v, a.v); }
#endif
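// _mm512_mask_blend_*(k, x, y) picks y where the mask bit is set, so the
// operands are passed as (s.v, b.v, a.v) to make select_intrin(s, a, b)
// behave like the ternary s ? a : b in every lane.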
// Special functions
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<float,16>> {
  static constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
  static inline VecData<float,16> eval(const VecData<float,16>& a) {
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,16>>::eval(_mm512_rsqrt14_ps(a.v), a.v);
  }
  static inline VecData<float,16> eval(const VecData<float,16>& a, const Mask<VecData<float,16>>& m) {
#if defined(__AVX512DQ__)
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,16>>::eval(_mm512_maskz_rsqrt14_ps(m.v, a.v), a.v);
#else
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<float,16>>::eval(and_intrin(VecData<float,16>(_mm512_rsqrt14_ps(a.v)), convert_mask2vec_intrin(m)), a.v);
#endif
  }
};
template <Integer digits> struct rsqrt_approx_intrin<digits, VecData<double,8>> {
  static constexpr Integer newton_iter = mylog2((Integer)(digits/4.2144199393));
  static inline VecData<double,8> eval(const VecData<double,8>& a) {
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,8>>::eval(_mm512_rsqrt14_pd(a.v), a.v);
  }
  static inline VecData<double,8> eval(const VecData<double,8>& a, const Mask<VecData<double,8>>& m) {
#if defined(__AVX512DQ__)
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,8>>::eval(_mm512_maskz_rsqrt14_pd(m.v, a.v), a.v);
#else
    return rsqrt_newton_iter<newton_iter,newton_iter,VecData<double,8>>::eval(and_intrin(VecData<double,8>(_mm512_rsqrt14_pd(a.v)), convert_mask2vec_intrin(m)), a.v);
#endif
  }
};
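// The rsqrt specializations above start from _mm512_rsqrt14_ps/pd, which
// guarantee a relative error below 2^-14 (~4.21 decimal digits); each
// Newton-Raphson step roughly doubles the number of correct bits, hence
// newton_iter ~ log2(digits / 4.21). E.g. two steps refine 14 -> 28 -> 56
// bits, enough for full double precision.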
#ifdef SCTL_HAVE_SVML
template <> inline void sincos_intrin<VecData<float,16>>(VecData<float,16>& sinx, VecData<float,16>& cosx, const VecData<float,16>& x) { sinx = _mm512_sincos_ps(&cosx.v, x.v); }
template <> inline void sincos_intrin<VecData<double,8>>(VecData<double,8>& sinx, VecData<double,8>& cosx, const VecData<double,8>& x) { sinx = _mm512_sincos_pd(&cosx.v, x.v); }
template <> inline VecData<float,16> exp_intrin<VecData<float,16>>(const VecData<float,16>& x) { return _mm512_exp_ps(x.v); }
template <> inline VecData<double,8> exp_intrin<VecData<double,8>>(const VecData<double,8>& x) { return _mm512_exp_pd(x.v); }
#else
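// Without SVML, fall back to the header's generic polynomial approximations;
// the template argument picks the approximation order from the type's
// significand width (SigBits) so the error stays near machine precision,
// though the /3.2 and /3.8 ratios are empirical (see the TODOs below).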
template <> inline void sincos_intrin<VecData<float,16>>(VecData<float,16>& sinx, VecData<float,16>& cosx, const VecData<float,16>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<float>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline void sincos_intrin<VecData<double,8>>(VecData<double,8>& sinx, VecData<double,8>& cosx, const VecData<double,8>& x) {
  approx_sincos_intrin<(Integer)(TypeTraits<double>::SigBits/3.2)>(sinx, cosx, x); // TODO: determine constants more precisely
}
template <> inline VecData<float,16> exp_intrin<VecData<float,16>>(const VecData<float,16>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<float>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
template <> inline VecData<double,8> exp_intrin<VecData<double,8>>(const VecData<double,8>& x) {
  return approx_exp_intrin<(Integer)(TypeTraits<double>::SigBits/3.8)>(x); // TODO: determine constants more precisely
}
#endif
#endif
}
#endif //_SCTL_INTRIN_WRAPPER_HPP_