Peter Bruin on Tue, 08 Sep 2015 11:47:12 +0200

 Speed up {Flx,FpX,FpXQX}_divrem_basecase for suitable polynomials

• To: pari-dev@pari.math.u-bordeaux1.fr
• Subject: Speed up {Flx,FpX,FpXQX}_divrem_basecase for suitable polynomials
• From: Peter Bruin <P.J.Bruin@math.leidenuniv.nl>
• Date: Tue, 08 Sep 2015 11:47:30 +0200
• Cancel-lock: sha1:fsUkJu3hJ9kBo9+GE+FDWOXmFVQ=
• Delivery-date: Tue, 08 Sep 2015 11:47:13 +0200
• User-agent: Gnus/5.13 (Gnus v5.13) Emacs/24.5 (gnu/linux)

```Bonjour,

When computing with non-prime finite fields, one is often free to choose
a defining polynomial f (of some given degree n) over the prime field.
If we write f in the form c*X^n + f1 with deg(f1) < n, then for division
with remainder modulo f, it "should be" best to take deg(f1) as small as
possible.  However, the current code for division with remainder in PARI
does not yet give an advantage in that case.

The attached patch modifies the functions {Flx,FpX,FpXQX}_divrem_basecase
and Flx_rem_basecase in order to reduce the complexity of computing the
(quotient and) remainder of g by f from O((deg g - deg f)(deg f)) to
O((deg g - deg f)(deg f1)).

It passes the test suite and gives a noticeable speedup in my code for
computing with curves over finite fields.  For example, with this patch,
multiplying two 10 x 10 matrices over a field of size 59^100 becomes
about 25% faster when choosing a polynomial of the form x^100 + O(x^3)
instead of an arbitrary polynomial of degree 100 over F_59; previously
there was no visible difference.

Thanks,

Peter

```
```diff --git a/src/basemath/Flx.c b/src/basemath/Flx.c
index 9387df7..8d7fe24 100644
--- a/src/basemath/Flx.c
+++ b/src/basemath/Flx.c
@@ -1179,7 +1179,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
{
pari_sp av;
GEN z, c;
-  long dx,dy,dz,i,j;
+  long dx,dy,dy1,dz,i,j;
ulong p1,inv;
long vs=x[1];

@@ -1189,6 +1189,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
x += 2; y += 2;
inv = y[dy];
if (inv != 1UL) inv = Fl_inv(inv,p);
+  for (dy1=dy-1; dy1>=0 && !y[dy1]; dy1--);

c = cgetg(dy+3, t_VECSMALL); c[1]=vs; c += 2; av=avma;
z = cgetg(dz+3, t_VECSMALL); z[1]=vs; z += 2;
@@ -1199,7 +1200,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
for (i=dx-1; i>=dy; --i)
{
p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
{
p1 += z[j]*y[i-j];
if (p1 & HIGHBIT) p1 %= p;
@@ -1210,7 +1211,7 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
for (i=0; i<dy; i++)
{
p1 = z[0]*y[i];
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
{
p1 += z[j]*y[i-j];
if (p1 & HIGHBIT) p1 %= p;
@@ -1225,14 +1226,14 @@ Flx_rem_basecase(GEN x, GEN y, ulong p)
for (i=dx-1; i>=dy; --i)
{
p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
p1 = Fl_addmul_pre(z[j], y[i-j], p1, p, pi);
z[i-dy] = p1? Fl_mul_pre(p - p1, inv, p, pi): 0;
}
for (i=0; i<dy; i++)
{
p1 = Fl_mul_pre(z[0],y[i],p,pi);
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
c[i] = Fl_sub(x[i], p1, p);
}
@@ -1248,7 +1249,7 @@ static GEN
Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
{
GEN z,q,c;
-  long dx,dy,dz,i,j;
+  long dx,dy,dy1,dz,i,j;
ulong p1,inv;
long sv=x[1];

@@ -1274,6 +1275,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
z = cgetg(dz + 3, t_VECSMALL); z[1] = sv; z += 2;
inv = uel(y, dy);
if (inv != 1UL) inv = Fl_inv(inv,p);
+  for (dy1=dy-1; dy1>=0 && !y[dy1]; dy1--);

if (SMALL_ULONG(p))
{
@@ -1281,7 +1283,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
for (i=dx-1; i>=dy; --i)
{
p1 = p - x[i]; /* compute -p1 instead of p1 (pb with ulongs otherwise) */
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
{
p1 += z[j]*y[i-j];
if (p1 & HIGHBIT) p1 %= p;
@@ -1296,7 +1298,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
for (i=dx-1; i>=dy; --i)
{ /* compute -p1 instead of p1 (pb with ulongs otherwise) */
p1 = p - uel(x,i);
-      for (j=i-dy+1; j<=i && j<=dz; j++)
+      for (j=i-dy1; j<=i && j<=dz; j++)
z[i-dy] = p1? Fl_mul(p - p1, inv, p): 0;
}
@@ -1310,7 +1312,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
for (i=0; i<dy; i++)
{
p1 = (ulong)z[0]*y[i];
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
{
p1 += (ulong)z[j]*y[i-j];
if (p1 & HIGHBIT) p1 %= p;
@@ -1323,7 +1325,7 @@ Flx_divrem_basecase(GEN x, GEN y, ulong p, GEN *pr)
for (i=0; i<dy; i++)
{
p1 = Fl_mul(z[0],y[i],p);
-      for (j=1; j<=i && j<=dz; j++)
+      for (j=maxss(1,i-dy1); j<=i && j<=dz; j++)
c[i] = Fl_sub(x[i], p1, p);
}
diff --git a/src/basemath/FpX.c b/src/basemath/FpX.c
index f601efe..1c5a1dc 100644
--- a/src/basemath/FpX.c
+++ b/src/basemath/FpX.c
@@ -350,7 +350,7 @@ FpX_halve(GEN y, GEN p)
static GEN
FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
{
-  long vx, dx, dy, dz, i, j, sx, lr;
+  long vx, dx, dy, dy1, dz, i, j, sx, lr;
pari_sp av0, av;

@@ -400,13 +400,14 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
avma = av0;
z=cgetg(dz+3,t_POL); z[1] = x[1];
x += 2; y += 2; z += 2;
+  for (dy1=dy-1; dy1>=0 && !signe(gel(y, dy1)); dy1--);

p1 = gel(x,dx); av = avma;
for (i=dx-1; i>=dy; i--)
{
av=avma; p1=gel(x,i);
-    for (j=i-dy+1; j<=i && j<=dz; j++)
+    for (j=i-dy1; j<=i && j<=dz; j++)
p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
gel(z,i-dy) = gerepileuptoint(av,modii(p1, p));
@@ -417,7 +418,7 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
for (sx=0; ; i--)
{
p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
p1 = modii(p1,p); if (signe(p1)) { sx = 1; break; }
if (!i) break;
@@ -437,7 +438,7 @@ FpX_divrem_basecase(GEN x, GEN y, GEN p, GEN *pr)
for (i--; i>=0; i--)
{
av=avma; p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
p1 = subii(p1, mulii(gel(z,j),gel(y,i-j)));
gel(rem,i) = gerepileuptoint(av, modii(p1,p));
}
diff --git a/src/basemath/FpXX.c b/src/basemath/FpXX.c
index ee6b8f3..2c17c4f 100644
--- a/src/basemath/FpXX.c
+++ b/src/basemath/FpXX.c
@@ -321,7 +321,7 @@ FpXQX_FpXQ_mul(GEN P, GEN U, GEN T, GEN p)
static GEN
FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
{
-  long vx, dx, dy, dz, i, j, sx, lr;
+  long vx, dx, dy, dy1, dz, i, j, sx, lr;
pari_sp av0, av, tetpil;

@@ -373,13 +373,14 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
avma = av0;
z = cgetg(dz+3,t_POL); z[1] = x[1];
x += 2; y += 2; z += 2;
+  for (dy1=dy-1; dy1>=0 && !signe(gel(y, dy1)); dy1--);

p1 = gel(x,dx); av = avma;
for (i=dx-1; i>=dy; i--)
{
av=avma; p1=gel(x,i);
-    for (j=i-dy+1; j<=i && j<=dz; j++)
+    for (j=i-dy1; j<=i && j<=dz; j++)
p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j),NULL,p),NULL,p);
tetpil=avma; gel(z,i-dy) = gerepile(av,tetpil,Fq_red(p1,T,p));
@@ -390,7 +391,7 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
for (sx=0; ; i--)
{
p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j),NULL,p),NULL,p);
tetpil=avma; p1 = Fq_red(p1, T, p); if (signe(p1)) { sx = 1; break; }
if (!i) break;
@@ -410,7 +411,7 @@ FpXQX_divrem_basecase(GEN x, GEN y, GEN T, GEN p, GEN *pr)
for (i--; i>=0; i--)
{
av=avma; p1 = gel(x,i);
-    for (j=0; j<=i && j<=dz; j++)
+    for (j=maxss(0,i-dy1); j<=i && j<=dz; j++)
p1 = Fq_sub(p1, Fq_mul(gel(z,j),gel(y,i-j), NULL,p), NULL,p);
tetpil=avma; gel(rem,i) = gerepile(av,tetpil, Fq_red(p1, T, p));
}
```