Jean-Pierre Flori on Fri, 15 Jan 2016 11:28:47 +0100 |
[Date Prev] [Date Next] [Thread Prev] [Thread Next] [Date Index] [Thread Index]
Re: Faster exponentiation in some extensions of finite fields of small characteristics |
Dear all, Here is a simplified patch only adding the 4-into-1 stuff. It could surely be refactorized as well, but it's simple enough to be included. It lacks proper tuning. Improvement example: * before: ? a = ffgen(7^1000,'a) time = 32 ms. %1 = a ? a^(7^1000) time = 724 ms. %2 = a * after: ? a = ffgen(7^1000,'a) time = 20 ms. %1 = a ? a^(7^1000) time = 368 ms. %2 = a so a 2x speedup as expected. Best, 2016-01-14 17:05 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: > Looking at what Peter did, I now see I should just wrap his code! > http://pari.math.u-bordeaux.fr/archives/pari-dev-1510/msg00004.html > > 2016-01-14 13:32 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: >> Should be better now. >> >> 2016-01-14 13:22 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: >>> It seems it was an old buggy version of the patch. >>> I'm trying to relocate the right commit. >>> >>> 2016-01-14 11:58 GMT+01:00 Jean-Pierre Flori <jpflori@gmail.com>: >>>> Dear all, >>>> >>>> Here is a preliminary patch to speed up exponentiation in some >>>> extensions of finite fields of small characteristics (< size of >>>> machine word/2). >>>> >>>> It packs more coefficients of the polynomial representation over the >>>> prime field into machine words when the finite field element is mapped >>>> to a multiprecision integer (i.e. KS). >>>> Two functions are added: >>>> * one function which packs 4 coeffs into a machine word, >>>> * one generic funciton which packs the coeffs as much as possible >>>> possibly crossing machine words boundaries. >>>> >>>> I did not take care of adding new tuning parameters or smartly >>>> choosing between the different functions, e.g. calling the >>>> 4-coeffs-in-1-word function when the (product) coeff size is >>>> BITS_IN_LONG/4-\epsilon might be more efficient than using the generic >>>> function which does more complicated packing when the polynomial >>>> degree is not large. >>>> >>>> I did not add (yet) optimal packing when the product coeffs are larger >>>> than a machine word. >>>> >>>> I did not really check it is bug free. >>>> >>>> Best, >>>> -- >>>> Jean-Pierre Flori >>> >>> >>> >>> -- >>> Jean-Pierre Flori >> >> >> >> -- >> Jean-Pierre Flori > > > > -- > Jean-Pierre Flori -- Jean-Pierre Flori
diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c index 715cb02..e071788 100644 --- a/src/basemath/Flx.c +++ b/src/basemath/Flx.c @@ -594,6 +594,12 @@ Flx_shiftip(pari_sp av, GEN x, long v) avma = (pari_sp)y; return y; } +#define BITS_IN_QUARTULONG (BITS_IN_HALFULONG >> 1) +#define QUARTMASK ((1UL<<BITS_IN_QUARTULONG)-1UL) +#define LLQUARTWORD(x) ((x) & QUARTMASK) +#define HLQUARTWORD(x) (((x) >> BITS_IN_QUARTULONG) & QUARTMASK) +#define LHQUARTWORD(x) (((x) >> (2*BITS_IN_QUARTULONG)) & QUARTMASK) +#define HHQUARTWORD(x) (((x) >> (3*BITS_IN_QUARTULONG)) & QUARTMASK) INLINE long maxlengthcoeffpol(ulong p, long n) { @@ -601,7 +607,11 @@ maxlengthcoeffpol(ulong p, long n) GEN z = muliu(sqru(p-1), n); long l = lgefint(z); avma = ltop; - if (l==3 && HIGHWORD(z[2])==0) return 0; + if (l==3 && HIGHWORD(z[2])==0) + { + if (HLQUARTWORD(z[2]) == 0) return -1; + else return 0; + } return l-2; } @@ -708,6 +718,57 @@ Flx_mulspec_halfmulii(GEN a, GEN b, ulong p, long na, long nb) return int_to_Flx_half(z,p); } +static GEN +Flx_to_int_quartspec(GEN a, long na) +{ + long j; + long n = (na+3)>>2UL; + GEN V = cgetipos(2+n); + GEN w; + for (w = int_LSW(V), j=0; j+3<na; j+=4, w=int_nextW(w)) + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG))|(a[j+3]<<(3*BITS_IN_QUARTULONG)); + switch (na-j) + { + case 3: + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG)|(a[j+2]<<(2*BITS_IN_QUARTULONG)); + break; + case 2: + *w = a[j]|(a[j+1]<<BITS_IN_QUARTULONG); + break; + case 1: + *w = a[j]; + break; + case 0: + break; + } + return V; +} + +static GEN +int_to_Flx_quart(GEN z, ulong p) +{ + long i; + long lx = (lgefint(z)-2)*4+2; + GEN w, x = cgetg(lx, t_VECSMALL); + for (w = int_LSW(z), i=2; i<lx; i+=4, w=int_nextW(w)) + { + x[i] = LLQUARTWORD((ulong)*w)%p; + x[i+1] = HLQUARTWORD((ulong)*w)%p; + x[i+2] = LHQUARTWORD((ulong)*w)%p; + x[i+3] = HHQUARTWORD((ulong)*w)%p; + } + return Flx_renormalize(x, lx); +} + +static GEN +Flx_mulspec_quartmulii(GEN a, GEN b, ulong p, long na, long nb) +{ + GEN A = Flx_to_int_quartspec(a,na); + GEN B = Flx_to_int_quartspec(b,nb); + GEN z = mulii(A,B); + return int_to_Flx_quart(z,p); +} + /*Eval x in 2^(k*BIL) in linear time, k==2 or 3*/ static GEN Flx_eval2BILspec(GEN x, long k, long l) @@ -766,6 +827,7 @@ Flx_mulspec_mulii_inflate(GEN x, GEN y, long N, ulong p, long nx, long ny) return gerepileupto(av, Z_mod2BIL_Flx(z, N, nx+ny-2, p)); } +#define Flx_MUL_QUARTMULII_LIMIT Flx_MUL_HALFMULII_LIMIT /* fast product (Karatsuba) of polynomials a,b. These are not real GENs, a+2, * b+2 were sent instead. na, nb = number of terms of a, b. * Only c, c0, c1, c2 are genuine GEN. @@ -785,6 +847,10 @@ Flx_mulspec(GEN a, GEN b, ulong p, long na, long nb) av = avma; switch (maxlengthcoeffpol(p,nb)) { + case -1: + if (na>=Flx_MUL_QUARTMULII_LIMIT) + return Flx_shiftip(av,Flx_mulspec_quartmulii(a,b,p,na,nb), v); + break; case 0: if (na>=Flx_MUL_HALFMULII_LIMIT) return Flx_shiftip(av,Flx_mulspec_halfmulii(a,b,p,na,nb), v); @@ -910,6 +976,13 @@ Flx_sqrspec_halfsqri(GEN a, ulong p, long na) } static GEN +Flx_sqrspec_quartsqri(GEN a, ulong p, long na) +{ + GEN z = sqri(Flx_to_int_quartspec(a,na)); + return int_to_Flx_quart(z,p); +} + +static GEN Flx_sqrspec_sqri_inflate(GEN x, long N, ulong p, long nx) { pari_sp av = avma; @@ -917,6 +990,7 @@ Flx_sqrspec_sqri_inflate(GEN x, long N, ulong p, long nx) return gerepileupto(av, Z_mod2BIL_Flx(z, N, (nx-1)*2, p)); } +#define Flx_SQR_QUARTSQRI_LIMIT Flx_SQR_HALFSQRI_LIMIT static GEN Flx_sqrspec(GEN a, ulong p, long na) { @@ -930,6 +1004,10 @@ Flx_sqrspec(GEN a, ulong p, long na) av = avma; switch(maxlengthcoeffpol(p,na)) { + case -1: + if (na>=Flx_SQR_QUARTSQRI_LIMIT) + return Flx_shiftip(av, Flx_sqrspec_quartsqri(a,p,na), v); + break; case 0: if (na>=Flx_SQR_HALFSQRI_LIMIT) return Flx_shiftip(av, Flx_sqrspec_halfsqri(a,p,na), v); @@ -1033,6 +1111,11 @@ Flx_multhreshold(GEN T, ulong p, long half, long mul, long mul2, long kara) long na = lgpol(T); switch (maxlengthcoeffpol(p,na)) { + case -1: + /* To be fixed for quart */ + if (na>=Flx_MUL_QUARTMULII_LIMIT) + return na>=half; + break; case 0: if (na>=Flx_MUL_HALFMULII_LIMIT) return na>=half;