Line data Source code
1 : /* This file is part of the Vc library.
2 :
3 : Copyright (C) 2009-2012 Matthias Kretz <kretz@kde.org>
4 :
5 : Vc is free software: you can redistribute it and/or modify
6 : it under the terms of the GNU Lesser General Public License as
7 : published by the Free Software Foundation, either version 3 of
8 : the License, or (at your option) any later version.
9 :
10 : Vc is distributed in the hope that it will be useful, but
11 : WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU Lesser General Public License for more details.
14 :
15 : You should have received a copy of the GNU Lesser General Public
16 : License along with Vc. If not, see <http://www.gnu.org/licenses/>.
17 :
18 : */
19 :
20 : #ifndef V_ALIGN // keep a V_ALIGN definition that was already supplied before this point
21 : # ifdef __GNUC__
22 : # define V_ALIGN(n) __attribute__((aligned(n))) // GNU attribute spelling of an n-byte alignment request
23 : # else
24 : # define V_ALIGN(n) __declspec(align(n)) // used for every compiler that does not define __GNUC__
25 : # endif
26 : #endif
27 :
28 : #include "Vc/avx/const_data.h"
29 : #include "Vc/sse/const_data.h"
30 : #include <Vc/version.h>
31 :
32 : #include <cstdio>
33 : #include <cstdlib>
34 : #include <cstring>
35 :
36 : #include "Vc/common/macros.h"
37 :
38 : namespace AliRoot {
39 : namespace Vc
40 : {
41 : namespace AVX
42 : {
43 : // cacheline 1: tables of the consecutive indexes 0, 1, 2, ... as unsigned int, unsigned short and unsigned char
44 : V_ALIGN(64) extern const unsigned int _IndexesFromZero32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // extern + initializer: a definition with external linkage
45 : V_ALIGN(16) extern const unsigned short _IndexesFromZero16[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
46 : V_ALIGN(16) extern const unsigned char _IndexesFromZero8 [16]= { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
47 :
48 : template<> const double c_trig<double>::data[] = { // 60 entries, each constant stored once; order must stay index-parallel with c_trig<float>::data
49 : // cacheline 4 -- NOTE(review): the cacheline numbers in this table are not consecutive (4, 5, 8, 10, ...); each group here holds 8 doubles -- confirm the labels
50 : Vc_buildDouble(1, 0x921fb54442d18ull, -1), // π/4
51 : Vc_buildDouble(1, 0x921fb40000000ull, -1), // π/4 - 30bits precision
52 : Vc_buildDouble(1, 0x4442d00000000ull, -25), // π/4 remainder1 - 32bits precision
53 : Vc_buildDouble(1, 0x8469898cc5170ull, -49), // π/4 remainder2
54 : 0.0625,
55 : 16.,
56 : 0., // padding
57 : 0., // padding
58 : // cacheline 5
59 : Vc_buildDouble( 1, 0x555555555554bull, -5), // ~ 1/4!
60 : Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10), // ~-1/6!
61 : Vc_buildDouble( 1, 0xa01a019c844f5ull, -16), // ~ 1/8!
62 : Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22), // ~-1/10!
63 : Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29), // ~ 1/12!
64 : Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37), // ~-1/14!
65 : Vc_buildDouble(-1, 0x5555555555548ull, -3), // ~-1/3!
66 : Vc_buildDouble( 1, 0x111111110f7d0ull, -7), // ~ 1/5!
67 : // cacheline 8
68 : Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13), // ~-1/7!
69 : Vc_buildDouble( 1, 0x71de3567d48a1ull, -19), // ~ 1/9!
70 : Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26), // ~-1/11!
71 : Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33), // ~ 1/13!
72 : 0., // padding (for alignment with float)
73 : Vc_buildDouble(1, 0x8BE60DB939105ull, 0), // 4/π
74 : Vc_buildDouble(1, 0x921fb54442d18ull, 0), // π/2
75 : Vc_buildDouble(1, 0x921fb54442d18ull, 1), // π
76 : // cacheline 10
77 : Vc_buildDouble(-1, 0xc007fa1f72594ull, -1), // atan P coefficients
78 : Vc_buildDouble(-1, 0x028545b6b807aull, 4), // atan P coefficients
79 : Vc_buildDouble(-1, 0x2c08c36880273ull, 6), // atan P coefficients
80 : Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6), // atan P coefficients
81 : Vc_buildDouble(-1, 0x03669fd28ec8eull, 6), // atan P coefficients
82 : Vc_buildDouble( 1, 0x8dbc45b14603cull, 4), // atan Q coefficients
83 : Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7), // atan Q coefficients
84 : Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8), // atan Q coefficients
85 : // cacheline 12
86 : Vc_buildDouble( 1, 0xe563f13b049eaull, 8), // atan Q coefficients
87 : Vc_buildDouble( 1, 0x8519efbbd62ecull, 7), // atan Q coefficients
88 : Vc_buildDouble( 1, 0x3504f333f9de6ull, 1), // tan( 3/8 π )
89 : 0.66, // lower threshold for special casing in atan
90 : Vc_buildDouble(1, 0x1A62633145C07ull, -54), // remainder of pi/2
91 : 1.e-8, // small asin input threshold
92 : 0.625, // large asin input threshold
93 : 0., // padding
94 : // cacheline 14
95 : Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9), // asinCoeff0
96 : Vc_buildDouble(-1, 0x2079259f9290full, -1), // asinCoeff0
97 : Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2), // asinCoeff0
98 : Vc_buildDouble(-1, 0x991aaac01ab68ull, 4), // asinCoeff0
99 : Vc_buildDouble( 1, 0xc896240f3081dull, 4), // asinCoeff0
100 : Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4), // asinCoeff1
101 : Vc_buildDouble( 1, 0x26219af6a7f42ull, 7), // asinCoeff1
102 : Vc_buildDouble(-1, 0x7fe08959063eeull, 8), // asinCoeff1
103 : // cacheline 16
104 : Vc_buildDouble( 1, 0x56709b0b644beull, 8), // asinCoeff1
105 : Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8), // asinCoeff2
106 : Vc_buildDouble(-1, 0x34341333e5c16ull, -1), // asinCoeff2
107 : Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2), // asinCoeff2
108 : Vc_buildDouble(-1, 0x04331de27907bull, 4), // asinCoeff2
109 : Vc_buildDouble( 1, 0x39007da779259ull, 4), // asinCoeff2
110 : Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3), // asinCoeff2
111 : Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3), // asinCoeff3
112 : // cacheline 18
113 : Vc_buildDouble( 1, 0x19fc025fe9054ull, 6), // asinCoeff3
114 : Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7), // asinCoeff3
115 : Vc_buildDouble( 1, 0x1705684ffbf9dull, 7), // asinCoeff3
116 : Vc_buildDouble(-1, 0x898220a3607acull, 5), // asinCoeff3
117 : };
118 : #define _4(x) x // identity here: this table stores each constant once (the SSE table further down redefines _4 as x, x, x, x)
119 : template<> const float c_trig<float>::data[] = { // 60 entries, index-parallel with c_trig<double>::data; padding rows fill slots only the double table uses
120 : // cacheline
121 : _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4
122 : _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision
123 : _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision
124 : _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2
125 : _4(0.0625f),
126 : _4(16.f),
127 : _4(0.f), // padding
128 : _4(0.f), // padding
129 : _4(4.166664568298827e-2f), // ~ 1/4!
130 : _4(-1.388731625493765e-3f), // ~-1/6!
131 : _4(2.443315711809948e-5f), // ~ 1/8!
132 : _4(0.f), // padding (for alignment with double)
133 : _4(0.f), // padding (for alignment with double)
134 : _4(0.f), // padding (for alignment with double)
135 : _4(-1.6666654611e-1f), // ~-1/3!
136 : _4(8.3321608736e-3f), // ~ 1/5!
137 : // cacheline
138 : _4(-1.9515295891e-4f), // ~-1/7!
139 : _4(0.f), // padding (for alignment with double)
140 : _4(0.f), // padding (for alignment with double)
141 : _4(0.f), // padding (for alignment with double)
142 : _4(8192.f), // loss threshold
143 : _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π
144 : _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2
145 : _4(Vc_buildFloat(1, 0x490FDB, 1)), // π
146 : _4(8.05374449538e-2f), // atan P coefficients
147 : _4(1.38776856032e-1f), // atan P coefficients
148 : _4(1.99777106478e-1f), // atan P coefficients
149 : _4(3.33329491539e-1f), // atan P coefficients
150 : _4(0.f), // padding (for alignment with double)
151 : _4(0.f), // padding (for alignment with double)
152 : _4(0.f), // padding (for alignment with double)
153 : _4(0.f), // padding (for alignment with double)
154 : // cacheline
155 : _4(0.f), // padding (for alignment with double)
156 : _4(0.f), // padding (for alignment with double)
157 : _4(2.414213562373095f), // tan( 3/8 π )
158 : _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan
159 : _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2
160 : _4(1.e-4f), // small asin input threshold
161 : _4(0.f), // padding (for alignment with double)
162 : _4(0.f), // padding (for alignment with double)
163 : _4(4.2163199048e-2f), // asinCoeff0
164 : _4(2.4181311049e-2f), // asinCoeff0
165 : _4(4.5470025998e-2f), // asinCoeff0
166 : _4(7.4953002686e-2f), // asinCoeff0
167 : _4(1.6666752422e-1f), // asinCoeff0
168 : _4(0.f), // padding (for alignment with double)
169 : _4(0.f), // padding (for alignment with double)
170 : _4(0.f), // padding (for alignment with double)
171 : // cacheline
172 : _4(0.f), // padding (for alignment with double)
173 : _4(0.f), // padding (for alignment with double)
174 : _4(0.f), // padding (for alignment with double)
175 : _4(0.f), // padding (for alignment with double)
176 : _4(0.f), // padding (for alignment with double)
177 : _4(0.f), // padding (for alignment with double)
178 : _4(0.f), // padding (for alignment with double)
179 : _4(0.f), // padding (for alignment with double)
180 : _4(0.f), // padding (for alignment with double)
181 : _4(0.f), // padding (for alignment with double)
182 : _4(0.f), // padding (for alignment with double)
183 : _4(0.f), // padding (for alignment with double)
184 : };
185 : #undef _4
186 :
187 : const unsigned int c_general::absMaskFloat[2] = { 0xffffffffu, 0x7fffffffu }; // element 1 clears only the float sign bit; NOTE(review): the pair presumably also serves as one 64-bit mask on little-endian targets -- confirm
188 : const unsigned int c_general::signMaskFloat[2] = { 0x0u, 0x80000000u }; // element 1 has only the float sign bit set
189 : const unsigned int c_general::highMaskFloat = 0xfffff000u; // all bits set except the low 12
190 : const float c_general::oneFloat = 1.f;
191 : const unsigned short c_general::minShort[2] = { 0x8000u, 0x8000u };
192 : const unsigned short c_general::one16[2] = { 1, 1 };
193 : const float c_general::_2power31 = 1u << 31; // shift is done on an unsigned operand, then the power of two is converted to float
194 :
195 : // cacheline 4
196 : const unsigned long long c_general::highMaskDouble = 0xfffffffff8000000ull; // all bits set except the low 27
197 : const double c_general::oneDouble = 1.;
198 : const unsigned long long c_general::frexpMask = 0xbfefffffffffffffull;
199 :
200 : const unsigned long long c_log<double>::data[21] = { // 21 raw 64-bit patterns, one per constant; meanings are given by the per-row comments
201 : 0x000003ff000003ffull // bias TODO: remove
202 : , 0x7ff0000000000000ull // exponentMask (+inf)
203 :
204 : , 0x3f1ab4c293c31bb0ull // P[0]
205 : , 0x3fdfd6f53f5652f2ull // P[1]
206 : , 0x4012d2baed926911ull // P[2]
207 : , 0x402cff72c63eeb2eull // P[3]
208 : , 0x4031efd6924bc84dull // P[4]
209 : , 0x401ed5637d7edcf8ull // P[5]
210 :
211 : , 0x40269320ae97ef8eull // Q[0]
212 : , 0x40469d2c4e19c033ull // Q[1]
213 : , 0x4054bf33a326bdbdull // Q[2]
214 : , 0x4051c9e2eb5eae21ull // Q[3]
215 : , 0x4037200a9e1f25b2ull // Q[4]
216 :
217 : , 0xfff0000000000000ull // -inf
218 : , 0x0010000000000000ull // min()
219 : , 0x3fe6a09e667f3bcdull // 1/sqrt(2)
220 : , 0x3fe6300000000000ull // round(ln(2) * 512) / 512
221 : , 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512
222 : , 0x3fe0000000000000ull // 0.5
223 : , 0x3fdbcb7b1526e50eull // log10(e)
224 : , 0x3ff71547652b82feull // log2(e)
225 : };
226 :
227 : template<> const unsigned int c_log<float>::data[21] = { // 21 raw 32-bit patterns; the two padding rows keep the trailing 8 rows at the same indexes as in c_log<double>::data
228 : 0x0000007fu // bias TODO: remove
229 : , 0x7f800000u // exponentMask (+inf)
230 :
231 : , 0x3d9021bbu // 7.0376836292e-2f // P[0]
232 : , 0xbdebd1b8u // -1.1514610310e-1f // P[1]
233 : , 0x3def251au // 1.1676998740e-1f // P[2]
234 : , 0xbdfe5d4fu // -1.2420140846e-1f // P[3]
235 : , 0x3e11e9bfu // 1.4249322787e-1f // P[4]
236 : , 0xbe2aae50u // -1.6668057665e-1f // P[5]
237 : , 0x3e4cceacu // 2.0000714765e-1f // P[6]
238 : , 0xbe7ffffcu // -2.4999993993e-1f // P[7]
239 : , 0x3eaaaaaau // 3.3333331174e-1f // P[8]
240 : , 0 // padding because of c_log<double>
241 : , 0 // padding because of c_log<double>
242 :
243 : , 0xff800000u // -inf
244 : , 0x00800000u // min()
245 : , 0x3f3504f3u // 1/sqrt(2)
246 : , 0x3f318000u // round(ln(2) * 512) / 512
247 : , 0xb95e8083u // ln(2) - round(ln(2) * 512) / 512
248 : , 0x3f000000u // 0.5
249 : , 0x3ede5bd9u // log10(e)
250 : , 0x3fb8aa3bu // log2(e)
251 : };
252 : } // namespace AVX
253 :
254 : namespace SSE
255 : {
256 : // cacheline 1 -- unlike the AVX constants above, each c_general constant here is repeated in every element of its array
257 : V_ALIGN(64) const int c_general::absMaskFloat[4] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff }; // every bit except the float sign bit
258 : V_ALIGN(16) const unsigned int c_general::signMaskFloat[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; // only the float sign bit
259 : V_ALIGN(16) const unsigned int c_general::highMaskFloat[4] = { 0xfffff000u, 0xfffff000u, 0xfffff000u, 0xfffff000u }; // all bits set except the low 12
260 : V_ALIGN(16) const short c_general::minShort[8] = { -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000, -0x8000 };
261 : V_ALIGN(16) extern const unsigned short _IndexesFromZero8[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // index table 0..7; extern + initializer defines it with external linkage
262 :
263 : // cacheline 2
264 : V_ALIGN(16) extern const unsigned int _IndexesFromZero4[4] = { 0, 1, 2, 3 }; // index table 0..3
265 : V_ALIGN(16) const unsigned short c_general::one16[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
266 : V_ALIGN(16) const unsigned int c_general::one32[4] = { 1, 1, 1, 1 };
267 : V_ALIGN(16) const float c_general::oneFloat[4] = { 1.f, 1.f, 1.f, 1.f };
268 :
269 : // cacheline 3
270 : V_ALIGN(16) const unsigned long long c_general::highMaskDouble[2] = { 0xfffffffff8000000ull, 0xfffffffff8000000ull }; // all bits set except the low 27
271 : V_ALIGN(16) const double c_general::oneDouble[2] = { 1., 1. };
272 : V_ALIGN(16) const long long c_general::absMaskDouble[2] = { 0x7fffffffffffffffll, 0x7fffffffffffffffll }; // every bit except the top (sign) bit
273 : V_ALIGN(16) const unsigned long long c_general::signMaskDouble[2] = { 0x8000000000000000ull, 0x8000000000000000ull }; // only the top (sign) bit
274 : V_ALIGN(16) const unsigned long long c_general::frexpMask[2] = { 0xbfefffffffffffffull, 0xbfefffffffffffffull };
275 :
275 : #define _2(x) x, x // repeats an initializer twice, so every constant below occupies two consecutive doubles
276 : template<> const double c_trig<double>::data[] = { // same constants, in the same order, as AVX::c_trig<double>::data, each stored twice
277 : // cacheline 4
278 : _2(Vc_buildDouble(1, 0x921fb54442d18ull, -1)), // π/4
279 : _2(Vc_buildDouble(1, 0x921fb40000000ull, -1)), // π/4 - 30bits precision
280 : _2(Vc_buildDouble(1, 0x4442d00000000ull, -25)), // π/4 remainder1 - 32bits precision
281 : _2(Vc_buildDouble(1, 0x8469898cc5170ull, -49)), // π/4 remainder2
282 : // cacheline 5
283 : _2(0.0625),
284 : _2(16.),
285 : _2(0.), // padding
286 : _2(0.), // padding
287 : // cacheline 6
288 : _2(Vc_buildDouble( 1, 0x555555555554bull, -5)), // ~ 1/4!
289 : _2(Vc_buildDouble(-1, 0x6c16c16c14f91ull, -10)), // ~-1/6!
290 : _2(Vc_buildDouble( 1, 0xa01a019c844f5ull, -16)), // ~ 1/8!
291 : _2(Vc_buildDouble(-1, 0x27e4f7eac4bc6ull, -22)), // ~-1/10!
292 : // cacheline 7
293 : _2(Vc_buildDouble( 1, 0x1ee9d7b4e3f05ull, -29)), // ~ 1/12!
294 : _2(Vc_buildDouble(-1, 0x8fa49a0861a9bull, -37)), // ~-1/14!
295 : _2(Vc_buildDouble(-1, 0x5555555555548ull, -3)), // ~-1/3!
296 : _2(Vc_buildDouble( 1, 0x111111110f7d0ull, -7)), // ~ 1/5!
297 : // cacheline 8
298 : _2(Vc_buildDouble(-1, 0xa01a019bfdf03ull, -13)), // ~-1/7!
299 : _2(Vc_buildDouble( 1, 0x71de3567d48a1ull, -19)), // ~ 1/9!
300 : _2(Vc_buildDouble(-1, 0xae5e5a9291f5dull, -26)), // ~-1/11!
301 : _2(Vc_buildDouble( 1, 0x5d8fd1fd19ccdull, -33)), // ~ 1/13!
302 : // cacheline 9
303 : _2(0.), // padding (for alignment with float)
304 : _2(Vc_buildDouble(1, 0x8BE60DB939105ull, 0)), // 4/π
305 : _2(Vc_buildDouble(1, 0x921fb54442d18ull, 0)), // π/2
306 : _2(Vc_buildDouble(1, 0x921fb54442d18ull, 1)), // π
307 : // cacheline 10
308 : _2(Vc_buildDouble(-1, 0xc007fa1f72594ull, -1)), // atan P coefficients
309 : _2(Vc_buildDouble(-1, 0x028545b6b807aull, 4)), // atan P coefficients
310 : _2(Vc_buildDouble(-1, 0x2c08c36880273ull, 6)), // atan P coefficients
311 : _2(Vc_buildDouble(-1, 0xeb8bf2d05ba25ull, 6)), // atan P coefficients
312 : // cacheline 11
313 : _2(Vc_buildDouble(-1, 0x03669fd28ec8eull, 6)), // atan P coefficients
314 : _2(Vc_buildDouble( 1, 0x8dbc45b14603cull, 4)), // atan Q coefficients
315 : _2(Vc_buildDouble( 1, 0x4a0dd43b8fa25ull, 7)), // atan Q coefficients
316 : _2(Vc_buildDouble( 1, 0xb0e18d2e2be3bull, 8)), // atan Q coefficients
317 : // cacheline 12
318 : _2(Vc_buildDouble( 1, 0xe563f13b049eaull, 8)), // atan Q coefficients
319 : _2(Vc_buildDouble( 1, 0x8519efbbd62ecull, 7)), // atan Q coefficients
320 : _2(Vc_buildDouble( 1, 0x3504f333f9de6ull, 1)), // tan( 3/8 π )
321 : _2(0.66), // lower threshold for special casing in atan
322 : // cacheline 13
323 : _2(Vc_buildDouble(1, 0x1A62633145C07ull, -54)), // remainder of pi/2
324 : _2(1.e-8), // small asin input threshold
325 : _2(0.625), // large asin input threshold
326 : _2(0.), // padding
327 : // cacheline 14
328 : _2(Vc_buildDouble( 1, 0x84fc3988e9f08ull, -9)), // asinCoeff0
329 : _2(Vc_buildDouble(-1, 0x2079259f9290full, -1)), // asinCoeff0
330 : _2(Vc_buildDouble( 1, 0xbdff5baf33e6aull, 2)), // asinCoeff0
331 : _2(Vc_buildDouble(-1, 0x991aaac01ab68ull, 4)), // asinCoeff0
332 : // cacheline 15
333 : _2(Vc_buildDouble( 1, 0xc896240f3081dull, 4)), // asinCoeff0
334 : _2(Vc_buildDouble(-1, 0x5f2a2b6bf5d8cull, 4)), // asinCoeff1
335 : _2(Vc_buildDouble( 1, 0x26219af6a7f42ull, 7)), // asinCoeff1
336 : _2(Vc_buildDouble(-1, 0x7fe08959063eeull, 8)), // asinCoeff1
337 : // cacheline 16
338 : _2(Vc_buildDouble( 1, 0x56709b0b644beull, 8)), // asinCoeff1
339 : _2(Vc_buildDouble( 1, 0x16b9b0bd48ad3ull, -8)), // asinCoeff2
340 : _2(Vc_buildDouble(-1, 0x34341333e5c16ull, -1)), // asinCoeff2
341 : _2(Vc_buildDouble( 1, 0x5c74b178a2dd9ull, 2)), // asinCoeff2
342 : // cacheline 17
343 : _2(Vc_buildDouble(-1, 0x04331de27907bull, 4)), // asinCoeff2
344 : _2(Vc_buildDouble( 1, 0x39007da779259ull, 4)), // asinCoeff2
345 : _2(Vc_buildDouble(-1, 0x0656c06ceafd5ull, 3)), // asinCoeff2
346 : _2(Vc_buildDouble(-1, 0xd7b590b5e0eabull, 3)), // asinCoeff3
347 : // cacheline 18
348 : _2(Vc_buildDouble( 1, 0x19fc025fe9054ull, 6)), // asinCoeff3
349 : _2(Vc_buildDouble(-1, 0x265bb6d3576d7ull, 7)), // asinCoeff3
350 : _2(Vc_buildDouble( 1, 0x1705684ffbf9dull, 7)), // asinCoeff3
351 : _2(Vc_buildDouble(-1, 0x898220a3607acull, 5)), // asinCoeff3
352 : };
353 : #undef _2
355 : #define _4(x) x, x, x, x // repeats an initializer four times, so every constant below occupies four consecutive floats
356 : template<> const float c_trig<float>::data[] = { // same constants, in the same order, as AVX::c_trig<float>::data, each stored four times
357 : // cacheline
358 : _4(Vc_buildFloat( 1, 0x490FDB, -1)), // π/4
359 : _4(Vc_buildFloat( 1, 0x491000, -1)), // π/4 - 12 bits precision
360 : _4(Vc_buildFloat(-1, 0x157000, -19)), // π/4 remainder1 - 12 bits precision
361 : _4(Vc_buildFloat(-1, 0x6F4B9F, -32)), // π/4 remainder2
362 : // cacheline
363 : _4(0.0625f),
364 : _4(16.f),
365 : _4(0.f), // padding
366 : _4(0.f), // padding
367 : // cacheline
368 : _4(4.166664568298827e-2f), // ~ 1/4!
369 : _4(-1.388731625493765e-3f), // ~-1/6!
370 : _4(2.443315711809948e-5f), // ~ 1/8!
371 : _4(0.f), // padding (for alignment with double)
372 : // cacheline
373 : _4(0.f), // padding (for alignment with double)
374 : _4(0.f), // padding (for alignment with double)
375 : _4(-1.6666654611e-1f), // ~-1/3!
376 : _4(8.3321608736e-3f), // ~ 1/5!
377 : // cacheline
378 : _4(-1.9515295891e-4f), // ~-1/7!
379 : _4(0.f), // padding (for alignment with double)
380 : _4(0.f), // padding (for alignment with double)
381 : _4(0.f), // padding (for alignment with double)
382 : // cacheline
383 : _4(8192.f), // loss threshold
384 : _4(Vc_buildFloat(1, 0x22F983, 0)), // 1.27323949337005615234375 = 4/π
385 : _4(Vc_buildFloat(1, 0x490FDB, 0)), // π/2
386 : _4(Vc_buildFloat(1, 0x490FDB, 1)), // π
387 : // cacheline
388 : _4(8.05374449538e-2f), // atan P coefficients
389 : _4(1.38776856032e-1f), // atan P coefficients
390 : _4(1.99777106478e-1f), // atan P coefficients
391 : _4(3.33329491539e-1f), // atan P coefficients
392 : // cacheline
393 : _4(0.f), // padding (for alignment with double)
394 : _4(0.f), // padding (for alignment with double)
395 : _4(0.f), // padding (for alignment with double)
396 : _4(0.f), // padding (for alignment with double)
397 : // cacheline
398 : _4(0.f), // padding (for alignment with double)
399 : _4(0.f), // padding (for alignment with double)
400 : _4(2.414213562373095f), // tan( 3/8 π )
401 : _4(0.414213562373095f), // tan( 1/8 π ) lower threshold for special casing in atan
402 : // cacheline
403 : _4(Vc_buildFloat(-1, 0x3BBD2E, -25)), // remainder of pi/2
404 : _4(1.e-4f), // small asin input threshold
405 : _4(0.f), // padding (for alignment with double)
406 : _4(0.f), // padding (for alignment with double)
407 : // cacheline
408 : _4(4.2163199048e-2f), // asinCoeff0
409 : _4(2.4181311049e-2f), // asinCoeff0
410 : _4(4.5470025998e-2f), // asinCoeff0
411 : _4(7.4953002686e-2f), // asinCoeff0
412 : // cacheline
413 : _4(1.6666752422e-1f), // asinCoeff0
414 : _4(0.f), // padding (for alignment with double)
415 : _4(0.f), // padding (for alignment with double)
416 : _4(0.f), // padding (for alignment with double)
417 : // cacheline
418 : _4(0.f), // padding (for alignment with double)
419 : _4(0.f), // padding (for alignment with double)
420 : _4(0.f), // padding (for alignment with double)
421 : _4(0.f), // padding (for alignment with double)
422 : // cacheline
423 : _4(0.f), // padding (for alignment with double)
424 : _4(0.f), // padding (for alignment with double)
425 : _4(0.f), // padding (for alignment with double)
426 : _4(0.f), // padding (for alignment with double)
427 : // cacheline
428 : _4(0.f), // padding (for alignment with double)
429 : _4(0.f), // padding (for alignment with double)
430 : _4(0.f), // padding (for alignment with double)
431 : _4(0.f), // padding (for alignment with double)
432 : };
433 : #undef _4
434 :
435 : // cacheline 8
436 : V_ALIGN(16) extern const unsigned char _IndexesFromZero16[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; // index table 0..15 stored as unsigned char
437 :
438 : V_ALIGN(64) const unsigned long long c_log<double>::data[21 * 2] = { // the 21 constants of the AVX c_log<double> table, each written twice; /*N*/ is the logical constant index
439 : /* 0*/ 0x000003ff000003ffull, 0x000003ff000003ffull // bias TODO: remove
440 : /* 1*/ , 0x7ff0000000000000ull, 0x7ff0000000000000ull // exponentMask (+inf)
441 :
442 : /* 2*/ , 0x3f1ab4c293c31bb0ull, 0x3f1ab4c293c31bb0ull // P[0]
443 : /* 3*/ , 0x3fdfd6f53f5652f2ull, 0x3fdfd6f53f5652f2ull // P[1]
444 : /* 4*/ , 0x4012d2baed926911ull, 0x4012d2baed926911ull // P[2]
445 : /* 5*/ , 0x402cff72c63eeb2eull, 0x402cff72c63eeb2eull // P[3]
446 : /* 6*/ , 0x4031efd6924bc84dull, 0x4031efd6924bc84dull // P[4]
447 : /* 7*/ , 0x401ed5637d7edcf8ull, 0x401ed5637d7edcf8ull // P[5]
448 :
449 : /* 8*/ , 0x40269320ae97ef8eull, 0x40269320ae97ef8eull // Q[0]
450 : /* 9*/ , 0x40469d2c4e19c033ull, 0x40469d2c4e19c033ull // Q[1]
451 : /*10*/ , 0x4054bf33a326bdbdull, 0x4054bf33a326bdbdull // Q[2]
452 : /*11*/ , 0x4051c9e2eb5eae21ull, 0x4051c9e2eb5eae21ull // Q[3]
453 : /*12*/ , 0x4037200a9e1f25b2ull, 0x4037200a9e1f25b2ull // Q[4]
454 :
455 : /*13*/ , 0xfff0000000000000ull, 0xfff0000000000000ull // -inf
456 : /*14*/ , 0x0010000000000000ull, 0x0010000000000000ull // min()
457 : /*15*/ , 0x3fe6a09e667f3bcdull, 0x3fe6a09e667f3bcdull // 1/sqrt(2)
458 : /*16*/ , 0x3fe6300000000000ull, 0x3fe6300000000000ull // round(ln(2) * 512) / 512
459 : /*17*/ , 0xbf2bd0105c610ca8ull, 0xbf2bd0105c610ca8ull // ln(2) - round(ln(2) * 512) / 512
460 : /*18*/ , 0x3fe0000000000000ull, 0x3fe0000000000000ull // 0.5
461 : /*19*/ , 0x3fdbcb7b1526e50eull, 0x3fdbcb7b1526e50eull // log10(e)
462 : /*20*/ , 0x3ff71547652b82feull, 0x3ff71547652b82feull // log2(e)
463 : };
464 :
465 : template<> V_ALIGN(64) const unsigned int c_log<float>::data[21 * 4] = { // the 21 constants of the AVX c_log<float> table, each written four times per row
466 : 0x0000007fu, 0x0000007fu, 0x0000007fu, 0x0000007fu, // bias TODO: remove
467 : 0x7f800000u, 0x7f800000u, 0x7f800000u, 0x7f800000u, // exponentMask (+inf)
468 :
469 : 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, 0x3d9021bbu, // 7.0376836292e-2f // P[0]
470 : 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, 0xbdebd1b8u, // -1.1514610310e-1f // P[1]
471 : 0x3def251au, 0x3def251au, 0x3def251au, 0x3def251au, // 1.1676998740e-1f // P[2]
472 : 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, 0xbdfe5d4fu, // -1.2420140846e-1f // P[3]
473 : 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, 0x3e11e9bfu, // 1.4249322787e-1f // P[4]
474 : 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, 0xbe2aae50u, // -1.6668057665e-1f // P[5]
475 : 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, 0x3e4cceacu, // 2.0000714765e-1f // P[6]
476 : 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, 0xbe7ffffcu, // -2.4999993993e-1f // P[7]
477 : 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, 0x3eaaaaaau, // 3.3333331174e-1f // P[8]
478 : 0, 0, 0, 0, // padding because of c_log<double>
479 : 0, 0, 0, 0, // padding because of c_log<double>
480 :
481 : 0xff800000u, 0xff800000u, 0xff800000u, 0xff800000u, // -inf
482 : 0x00800000u, 0x00800000u, 0x00800000u, 0x00800000u, // min()
483 : 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, 0x3f3504f3u, // 1/sqrt(2)
484 : // ln(2) = 0x3fe62e42fefa39ef
485 : // ln(2) = Vc_buildDouble( 1, 0x00062e42fefa39ef, -1)
486 : // = Vc_buildFloat( 1, 0x00317217(f7d), -1) + Vc_buildFloat( 1, 0x0077d1cd, -25)
487 : // = Vc_buildFloat( 1, 0x00318000(000), -1) + Vc_buildFloat(-1, 0x005e8083, -13)
488 : 0x3f318000u, 0x3f318000u, 0x3f318000u, 0x3f318000u, // round(ln(2) * 512) / 512
489 : 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, 0xb95e8083u, // ln(2) - round(ln(2) * 512) / 512
490 : 0x3f000000u, 0x3f000000u, 0x3f000000u, 0x3f000000u, // 0.5
491 : 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, 0x3ede5bd9u, // log10(e)
492 : 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, 0x3fb8aa3bu, // log2(e)
493 : // log10(2) = 0x3fd34413509f79ff
494 : // = Vc_buildDouble( 1, 0x00034413509f79ff, -2)
495 : // = Vc_buildFloat( 1, 0x001a209a(84fbcff8), -2) + Vc_buildFloat( 1, 0x0004fbcff(8), -26)
496 : //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
497 : //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
498 : //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
499 : //Vc_buildFloat( 1, 0x001a209a, -2), // log10(2)
500 : };
501 : } // namespace SSE
502 :
503 : V_ALIGN(64) unsigned int RandomState[16] = { // non-const, 64-byte aligned array of 16 words; NOTE(review): presumably the initial state of the library's random number generator -- confirm against its users
504 : 0x5a383a4fu, 0xc68bd45eu, 0x691d6d86u, 0xb367e14fu,
505 : 0xd689dbaau, 0xfde442aau, 0x3d265423u, 0x1a77885cu,
506 : 0x36ed2684u, 0xfb1f049du, 0x19e52f31u, 0x821e4dd7u,
507 : 0x23996d25u, 0x5962725au, 0x6aced4ceu, 0xd4c610f3u
508 : };
509 :
510 : // dummy symbol to emit warnings with GCC 4.3
511 : namespace Warnings {
512 0 : void _operator_bracket_warning() {} // intentionally empty: only the existence of the symbol matters
513 : } // namespace Warnings
514 :
515 : const char LIBRARY_VERSION[] = VC_VERSION_STRING; // version macro values captured when this file is compiled (i.e. when the library is built)
516 : const unsigned int LIBRARY_VERSION_NUMBER = VC_VERSION_NUMBER; // compared against the caller's version number in checkLibraryAbi
517 : const unsigned int LIBRARY_ABI_VERSION = VC_LIBRARY_ABI_VERSION; // must equal the caller's ABI version in checkLibraryAbi
518 : // Aborts the process unless the compiled library matches the caller's headers: the ABI versions must be equal and the library's version number must not be lower than versionNumber. compileTimeVersion is only used in the diagnostic text.
519 : void checkLibraryAbi(unsigned int compileTimeAbi, unsigned int versionNumber, const char *compileTimeVersion) {
520 6 : if (LIBRARY_ABI_VERSION != compileTimeAbi || LIBRARY_VERSION_NUMBER < versionNumber) {
521 0 : fprintf(stderr, "The versions of libVc.a (%s) and Vc/version.h (%s) are incompatible. Aborting.\n", LIBRARY_VERSION, compileTimeVersion); // stderr, not stdout: abort() need not flush buffered stdout, so the message could be lost
522 0 : abort();
523 : }
524 3 : }
525 :
526 : } // namespace Vc
527 : } // namespace AliRoot
528 :
529 : #undef V_ALIGN
|