/*	The F4 Groebner Basis Engine from axcas.net
	by Roman Pearce, February 2026

	This code is released into the public domain.

	This software and documentation is provided "as is", without warranty of any kind,
	express or implied, including but not limited to the warranties of merchantability,
	fitness for a particular purpose, and noninfringement.  In no event shall the authors
	or copyright holders be liable for any claim, damages, or other liability, whether in
	an action of contract, tort, or otherwise, arising from, out of or in connection with
	the software or the use or other dealings in the software.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define MULTI_THREAD	1
#define MAX_CPU		2048
int num_thread = 0;
int info = 3;


/*	Machine integer routines

	We assume the following non-portable things:
	- two's complement integer arithmetic
	- a signed right shift duplicates the sign bit
	- a flat memory model that is byte addressable
	- malloc aligns memory to the word size
*/

#define INT32 int
#define INT64 long long int
#define UINT32 unsigned int
#define UINT64 unsigned long long int
#define CHAR   unsigned char

/* determine word size */
#if UINTPTR_MAX==0xFFFFFFFF
	#define WORDSIZE 32
	typedef INT32	INT;
	typedef UINT32	UINT;
#elif UINTPTR_MAX==0xFFFFFFFFFFFFFFFF
	#define WORDSIZE 64
	typedef INT64	INT;
	typedef UINT64	UINT;
#else
	#error port WORDSIZE
#endif
#define I(x) ((INT)(x))
#define U(x) ((UINT)(x))


/* platform specific assembly support */
#if defined(_MSC_VER) && defined(_M_X64)
	#define MSCx64
	#include <intrin.h>
	#pragma intrinsic(_umul128)
	#pragma intrinsic(_addcarry_u64)
	#pragma intrinsic(_subborrow_u64)
#elif defined(__GNUC__) && defined(__x86_64__)
	#define GNUx64
#elif defined(__GNUC__) && defined(__arm64__)
	#define GNUa64
#elif defined(__GNUC__) && defined(__aarch64__)
	#define GNUa64
#endif


/* floating point sign */
#define fsign(x) (((x) < 0) ? -1 : ((x) > 0))

/* print bits */
void uprint(UINT x)
{
	int i;
	i = WORDSIZE-1;
	for (; i >= 0; i--) {
		printf("%s", (x >> i) & 1 ? "1" : "0");
	}
	printf("\n");
}


/* hash x into h */
UINT uhash(UINT h, UINT x)
{
#if WORDSIZE==32
	return (h ^ x)*270566475UL;
#else
	return (h ^ x)*36064050381096011ULL;
#endif
}


/* xorshift from Marsaglia */
UINT urandom32()
{
	static UINT32 rs = 2463534242UL;
	rs ^= (rs << 13);
	rs ^= (rs >> 17);
	rs ^= (rs << 5);
	return rs;
}

UINT urandom()
{
	UINT64 ss;
	static UINT32 rs = 2463534242UL;
	rs ^= (rs << 13);
	rs ^= (rs >> 17);
	rs ^= (rs << 5);
	if (WORDSIZE==32) return rs;
	ss = rs;
	rs ^= (rs << 13);
	rs ^= (rs >> 17);
	rs ^= (rs << 5);
	ss = (ss << 32) | rs;
	return (UINT)ss;
}

/* non-zero */
INT srandmod(INT p)
{
	INT x;
doit:	x = (urandom() % p);
	if (x==0) goto doit;
	return x;
}


/* absolute value */
UINT uabs(INT x)
{
	UINT s;
	s = x >> (WORDSIZE-1);
	return (x + s) ^ s;
}
#define ABS(x) uabs(x)


/* unsigned maximum */
UINT umax(UINT a, UINT b)
{
	return a > b ? a : b;
}


/* unsigned minimum */
UINT umin(UINT a, UINT b)
{
	return a > b ? b : a;
}


/* next power of two */
UINT up2(UINT x)
{
	x--;
	x |= x >> 2;
	x |= x >> 1;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
#if WORDSIZE==64
	x |= x >> 32;
#elif WORDSIZE > 64
	#error "port up2"
#endif
	return x+1;
}


/* count leading zeroes */
UINT ulz(UINT x)
{
	UINT n=0;
	if (!x) return WORDSIZE;
#if WORDSIZE==32
	if (x <= 0x0000FFFF) n += 16, x = x << 16;
	if (x <= 0x00FFFFFF) n += 8, x = x << 8;
	if (x <= 0x0FFFFFFF) n += 4, x = x << 4;
	if (x <= 0x3FFFFFFF) n += 2, x = x << 2;
	if (x <= 0x7FFFFFFF) n++;
#elif WORDSIZE==64
	if (x <= 0x00000000FFFFFFFF) n += 32, x = x << 32;
	if (x <= 0x0000FFFFFFFFFFFF) n += 16, x = x << 16;
	if (x <= 0x00FFFFFFFFFFFFFF) n += 8, x = x << 8;
	if (x <= 0x0FFFFFFFFFFFFFFF) n += 4, x = x << 4;
	if (x <= 0x3FFFFFFFFFFFFFFF) n += 2, x = x << 2;
	if (x <= 0x7FFFFFFFFFFFFFFF) n++;
#else
	#error port ulz
#endif
	return n;
}


/* count trailing zeroes */
UINT utz(UINT x)
{
	UINT n;
	for (n=0; !(x & 1); n++) x = x >> 1;
	return n;
}


/* popcount */
UINT upop(UINT n)
{
	UINT i;
	for (i=0; n; i++) n &= n-1;
	return i;
}


/* floor of log2 for n > 0 */
UINT ulog2(UINT n)
{
	return WORDSIZE-1-ulz(n);
}


/* modified Euclid */
UINT ugcd(UINT a, UINT b)
{
	UINT c;
	while (a && b) {
		if (a < b) c = a, a = b, b = c;
		c = a % b;
		a = b - c;
		b = c;
	}
	return (a | b);
}


/* a^n binary powering */
UINT upow(UINT a, UINT n)
{
	UINT r, s;
	r = 1, s = a;
	if (n & 1) r = s;
	n = n >> 1;
	while (n) {
		s = s * s;
		/* from LSB to MSB */
		if (n & 1) r = r * s;
		n = n >> 1;
	}
	return r;
}

/* 10^n lookup */
UINT upow10(UINT n)
{
#if WORDSIZE==32
	static UINT table[10] = {0};
#elif WORDSIZE==64
	static UINT table[20] = {0};
#else
	#error port upow10
#endif
	if (table[n]) return table[n];
	table[n] = upow(10,n);
	return table[n];
}

/* floor of log(a) base b */
UINT ulog(UINT a, UINT b)
{
	UINT x, y, i;
	if (b <= 1) return 0;
	for (x=1, i=0; i < WORDSIZE; i++) {
		y = x*b;
		if (y > a) break;
		if (y/b != x) break;
		x = y;
	}
	return i;
}


/* floor of a^(1/n) */
UINT uroot(UINT a, UINT n)
{
	UINT i, j, k, x, y, e, w;
	w = WORDSIZE;
	if (n == 0) return 0;
	if (n == 1) return a;
	if (a <= 1) return a;
	if (n >= w) return 1;
	/* binary search */
	i = k = 1;
	k = k << w/n;
	while (i <= k) {
		j = (i + k)/2;
		/* x = j to the nth power */
		for (x=j, e=1; e < n; e++) {
			y = x*j;
			/* overflow check */
			if (y/j != x) break;
			x = y;
		}
		/* x too big or too small */
		if (e < n || x > a) k = j-1;
		else if (x < a) i = j+1;
		else return j;
	}
	return k;
}


/* 1/a mod b from Knuth */
UINT uinvmod(UINT a, UINT b)
{
	UINT p, q, r, x, y, i;
	x = 0, y = 1, p = b;
	/* y < 0 if i odd */
	for (i=0; b; i++) {
		q = a / b;
		r = a % b;
		a = b, b = r;
		r = x;
		x = y - q*x;
		y = r;
	}
	/* not invertible */
	if (a > 1) return 0;
	if (i & 1) y += p;
	return y;
}


/* (a0:a1) += (b0:b1) return carry */
UINT uadd2(UINT *a0, UINT *a1, UINT b0, UINT b1)
{
#if defined(MSCx64)
	unsigned char c;
	c = _addcarry_u64(0,*a0,b0,a0);
	c = _addcarry_u64(c,*a1,b1,a1);
	return c;
#else
	UINT c, d, e;
	*a0 = *a0 + b0;
	  c = *a0 < b0;
	*a1 = *a1 + b1;
	  d = *a1 < b1;
	*a1 = *a1 + c;
	  e = *a1 < c;
	return d | e;
#endif
}


/* (a0:a1) -= (b0:b1) return borrow */
UINT usub2(UINT *a0, UINT *a1, UINT b0, UINT b1)
{
#if defined(MSCx64)
	unsigned char c;
	c = _subborrow_u64(0,*a0,b0,a0);
	c = _subborrow_u64(c,*a1,b1,a1);
	return c;
#else
	UINT t1, c, d, e;
	b0 = *a0 - b0;
	 c = *a0 < b0;
	b1 = *a1 - b1;
	 d = *a1 < b1;
	t1 =  b1 - c;
	 e =  b1 < t1;
	*a0 = b0;
	*a1 = t1;
	return d | e;
#endif
}


/* (lo:hi) = a*b */
UINT umul2(UINT a, UINT b, UINT *hi)
{
#if WORDSIZE==32
	UINT64 t0 = (UINT64)(a)*(b);
	*hi = (t0 >> 32);
	return t0;

#elif defined(MSCx64)
	return _umul128(a,b,hi);

#elif defined(GNUx64)
	__asm__("mulq %1" : "=a"(a), "=d"(b) : "0"(a), "1"(b) : "cc");
	*hi = b; return a;

#elif defined(GNUa64)
	UINT c;
	__asm__("umulh %0, %1, %2" : "=r"(c) : "r"(a), "r"(b) : "cc");
	*hi = c; return a*b;

#elif WORDSIZE==64
	UINT64 a0, b0, a1, b1, r0, r1, r2, t0;
	a0 = a & 0xFFFFFFFF;
	b0 = b & 0xFFFFFFFF;
	a1 = (a >> 32);
	b1 = (b >> 32);
	t0 = a0 * b0;
	r0 = t0 & 0xFFFFFFFF;
	t0 = a1 * b0 + (t0 >> 32);
	r1 = t0 & 0xFFFFFFFF;
	r2 = (t0 >> 32);
	t0 = a0 * b1 + r1;
	/* assign upper 64-bits to hi */
	*hi = a1 * b1 + (t0 >> 32) + r2;
	return (t0 << 32) + r0;
#else
	#error port umul2
#endif
}


/* (lo:hi)/v = q,r from Hacker's Delight */
UINT udiv2(UINT lo, UINT hi, UINT v, UINT *r)
{
#if WORDSIZE==32
	UINT64 t0 = hi;
	t0 = (t0 << 32) + lo;
	if (r) *r = t0 % v;
	return (UINT)(t0 / v);

#elif defined(GNUx64)
	__asm__("divq %4" : "=a"(lo), "=d"(hi) : "0"(lo), "1"(hi), "r"(v) : "cc");
	if (r) *r = hi; return lo;

#elif WORDSIZE==64
	UINT64 b, un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat;
	INT64  s=0;
	b = (UINT64)1 << 32;
	if (hi >= v) {	/* overflow */
		printf("udiv2 overflow\n");
		r = 0;
		*((UINT *)r) = 0;
		if (r) *r = 0xFFFFFFFFFFFFFFFF;
		return 0xFFFFFFFFFFFFFFFF;
	}
	s = ulz(v);
	v = v << s;
	vn1 = v >> 32;
	vn0 = v & 0xFFFFFFFF;
	un32 = (hi << s) | ((lo >> (64 - s)) & ((-s) >> 63));
	un10 = (lo << s);
	un1 = un10 >> 32;
	un0 = un10 & 0xFFFFFFFF;
	q1 = un32/vn1;
	rhat = un32 - q1*vn1;
    q1:	if (q1 >= b || q1*vn0 > b*rhat + un1) {
		q1 = q1 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto q1;
	}
	un21 = un32*b + un1 - q1*v;
	q0 = un21/vn1;
	rhat = un21 - q0*vn1;
    q0:	if (q0 >= b || q0*vn0 > b*rhat + un0) {
		q0 = q0 - 1;
		rhat = rhat + vn1;
		if (rhat < b) goto q0;
	}
	if (r) *r = (un21*b + un0 - q0*v) >> s;
	return q1*b + q0;
#else
	#error port udiv2
#endif
}


/* reciprocal of divisor */
/* it must be shifted up */
UINT nreciprocal(UINT d)
{
	UINT u0, u1, q;
	static UINT D = 0;
	static UINT Q = 0;
	if (d==D) return Q;
	u0 = -1; u1 = -d-1;
	q = udiv2(u0,u1,d,0);
	D = d, Q = q;
	return q;
}


/* 2/1 division from Moller and Granlund 2011 */
/* d is a normalized divisor with top bit set */
/* v = nreciprocal(d) divide u0 + u1*2^B by d */
UINT ndiv2(UINT u0, UINT u1, UINT d, UINT v, UINT *R)
{
	UINT q0, q1, r;
	q0 = umul2(u1,v,&q1);
	q0 += u0;
	q1 += u1 + (q0 < u0) + 1;
	r = u0 - q1*d;
	if (r > q0) q1--, r += d;
	if (r >= d) q1++, r -= d;
	if (R) *R = r;
	return q1;
}


/* multiply a * b mod c */
/* directly divide by c */
UINT umulmod(UINT a, UINT b, UINT c)
{
	/* (a:b) = a * b */
	/* a = (a:b) % c */
	a = umul2(a,b,&b);
	udiv2(a,b,c,&a);
	return a;
}

/* mulmod a * b mod c where */
/* u = c << w is normalized */
/* and v = nreciprocal of u */
/* and both a < c and b < c */
UINT nmulmod(UINT a, UINT b, UINT u, UINT v, UINT w)
{
	/* (a:b) = (a*b) << w */
	a = umul2(a << w, b, &b);
	ndiv2(a,b,u,v,&a);
	return a >> w;
}


/* a^b mod c, binary method */
/* u = c << w is normalized */
/* and v = nreciprocal of u */
UINT npowmod(UINT a, UINT b, UINT u, UINT v, UINT w)
{
	UINT r=1, s;
	s = a % (u >> w);
	if (b & 1) r = s;
	while (b >>= 1) {
		s = nmulmod(s,s,u,v,w);
		if (b & 1) r = nmulmod(r,s,u,v,w);
	}
	return r;
}


/* a^b mod c, binary powering */
UINT upowmod(UINT a, UINT b, UINT c)
{
	UINT u, v, w;
	w = ulz(c), u = c << w, v = nreciprocal(u);
	return npowmod(a,b,u,v,w);
}


/* a+b mod c, a and b are reduced */
UINT uaddmod(UINT a, UINT b, UINT c)
{
	return (a < c-b ? a+b : a+b-c);
}


/* a-b mod c, a and b are reduced */
UINT usubmod(UINT a, UINT b, UINT c)
{
	return (a < b ? a-b+c : a-b);
}



/*
	Parallel Programming 
*/

/* Windows 32-bit with MSVC */
#if MULTI_THREAD && defined(_MSC_VER) && !defined(_WIN64)
	#include <time.h>
	#include <intrin.h>

	void __stdcall GetSystemInfo(void *sysinfo);
	UINT __stdcall CreateThread(void *ta, UINT ss, void *f, void *x, int dw, UINT *id);
	 int __stdcall WaitForSingleObject(UINT id, int ms);

	#define cas32(ptr,old,new)	_InterlockedCompareExchange(ptr,new,old)
	#define cas(ptr,old,new)	cas32(ptr,old,new)

	INT numcpu()
	{
		unsigned int x[20] = {0};
		static INT ncpu = 0;
		if (ncpu) goto done;
		GetSystemInfo(&x);
		ncpu = x[5];
		/* override the number of threads */
		if (num_thread > 0) ncpu = num_thread;
	done:	return ncpu;
	}

	INT thread(void *f, void *x)
	{
		return CreateThread(0,0,f,x,0,0);
	}

	INT join(INT id)
	{
		WaitForSingleObject(id,-1);
		return 0;
	}

	double realtime()
	{
		double t = clock();
		return t / CLOCKS_PER_SEC;
	}

/* Windows 64-bit with MSVC */
#elif MULTI_THREAD && defined(_MSC_VER) && defined(_WIN64)
	#include <time.h>
	#include <intrin.h>

	void __stdcall GetSystemInfo(void *sysinfo);
	UINT __stdcall CreateThread(void *ta, UINT ss, void *f, void *x, int dw, UINT *id);
	 int __stdcall WaitForSingleObject(UINT id, int ms);

	#define cas32(ptr,old,new)	_InterlockedCompareExchange(ptr,new,old)
	#define cas(ptr,old,new)	_InterlockedCompareExchange64(ptr,new,old)

	INT numcpu()
	{
		unsigned int x[20] = {0};
		static INT ncpu = 0;
		if (ncpu) goto done;
		GetSystemInfo(&x);
		ncpu = x[8];
		/* override the number of threads */
		if (num_thread > 0) ncpu = num_thread;
	done:	return ncpu;
	}

	INT thread(void *f, void *x)
	{
		return CreateThread(0,0,f,x,0,0);
	}

	INT join(INT id)
	{
		WaitForSingleObject(id,-1);
		return 0;
	}

	double realtime()
	{
		double t = clock();
		return t / CLOCKS_PER_SEC;
	}

/* Windows 64-bit with GCC */
#elif MULTI_THREAD && defined(__GNUC__) && defined(_WIN64)
	#include <time.h>

	void __stdcall GetSystemInfo(void *sysinfo);
	UINT __stdcall CreateThread(void *ta, UINT ss, void *f, void *x, int dw, UINT *id);
	 int __stdcall WaitForSingleObject(UINT id, int ms);

	#define cas32(ptr,old,new)	__sync_val_compare_and_swap((INT32 *)ptr,old,new)
	#define cas(ptr,old,new)	__sync_val_compare_and_swap(ptr,old,new)

	INT numcpu()
	{
		unsigned int x[20] = {0};
		static INT ncpu = 0;
		if (ncpu) goto done;
		GetSystemInfo(&x);
		ncpu = x[8];
		/* override the number of threads */
		if (num_thread > 0) ncpu = num_thread;
	done:	return ncpu;
	}

	INT thread(void *f, void *x)
	{
		return CreateThread(0,0,f,x,0,0);
	}

	INT join(INT id)
	{
		WaitForSingleObject(id,-1);
		return 0;
	}

	double realtime()
	{
		double t = clock();
		return t / CLOCKS_PER_SEC;
	}

/* Mac OS */
#elif MULTI_THREAD && defined(__APPLE__)
	#include <sys/time.h>
	#include <unistd.h>
	#include <pthread.h>

	#define cas32(ptr,old,new)	__sync_val_compare_and_swap((INT32 *)ptr,old,new)
	#define cas(ptr,old,new)	__sync_val_compare_and_swap(ptr,old,new)

	INT numcpu()
	{
		static INT ncpu = 0;
		if (ncpu) goto done;
		ncpu = sysconf(_SC_NPROCESSORS_CONF);
		/* override the number of threads */
		if (num_thread > 0) ncpu = num_thread;
	done:	return ncpu;
	}

	INT thread(void *f, void *x)
	{
		int err;
		pthread_t pid;
		err = pthread_create(&pid,0,f,x);
		return (INT)pid;
	}

	INT join(INT id)
	{
		INT val;
		int err;
		pthread_t pid = (pthread_t)id;
		err = pthread_join(pid, (void **)(&val));
		return val;
	}

	double realtime()
	{
		struct timeval t;
		gettimeofday(&t,0);
		return 1.0 * t.tv_sec + 0.000001 * t.tv_usec;
	}

/* Linux */
#elif MULTI_THREAD && defined(__linux__)
	#include <sys/time.h>
	#include <unistd.h>
	#include <pthread.h>

	#define cas32(ptr,old,new)	__sync_val_compare_and_swap((INT32 *)ptr,old,new)
	#define cas(ptr,old,new)	__sync_val_compare_and_swap(ptr,old,new)

	INT numcpu()
	{
		static INT ncpu = 0;
		if (ncpu) goto done;
		ncpu = sysconf(_SC_NPROCESSORS_CONF);
		/* override the number of threads */
		if (num_thread > 0) ncpu = num_thread;
	done:	return ncpu;
	}

	INT thread(void *f, void *x)
	{
		int err;
		pthread_t pid;
		err = pthread_create(&pid,0,f,x);
		return (INT)pid;
	}

	INT join(INT id)
	{
		INT val;
		int err;
		pthread_t pid = (pthread_t)id;
		err = pthread_join(pid, (void **)(&val));
		return val;
	}

	double realtime()
	{
		struct timeval t;
		gettimeofday(&t,0);
		return 1.0 * t.tv_sec + 0.000001 * t.tv_usec;
	}

/* sequential */
#else
	#include <time.h>

	INT cas32(INT32 *ptr, INT32 old, INT32 new)
	{
		INT32 tmp = *ptr;
		if (tmp==old) *ptr = new;
		return tmp;
	}

	INT cas(INT *ptr, INT old, INT new)
	{
		INT tmp = *ptr;
		if (tmp==old) *ptr = new;
		return tmp;
	}

	INT numcpu()
	{
		return 1;
	}

	INT thread(void *f, void *x)
	{
		void (*g)(void *) = f;
		g(x);
		return 0;
	}

	INT join(INT id)
	{
		return 0;
	}

	double realtime()
	{
		double t = clock();
		return t / CLOCKS_PER_SEC;
	}
	
#endif


/*
	cmalloc

	track memory allocations in multithreaded C code
	acquiring memory adds blocks to a lock-free stack
	releasing memory is done all at once at the end
*/

INT cmalloc_list = 0;

void * cmalloc(size_t s)
{
	UINT *x, y, z;
	x = malloc(s + sizeof(INT));
	if (!x) goto fail;

retry:	y = cmalloc_list; x[0] = y;
	z = cas(&cmalloc_list, y, U(x));
	if (z != y) goto retry;
	return (void *)(x+1);

fail:	printf("cmalloc: out of memory\n");
	exit(1);
}

void cmalloc_free()
{
	UINT *x, *y;
	x = (UINT *)cmalloc_list;
	while (x) {
		y = (UINT *)(x[0]);
		free(x);
		x = y;
	}
	cmalloc_list = 0;
}


/*	F4 Algorithm

	Monomials are integers referring to exponent vectors stored in the f4_monom block of memory.
	EXPVEC(m) returns the start of the exponent vector and COLUMN(m) returns the column number.
	EXPVEC(0) is scratch space.  Monomials are added to a hash table to make them unique.

	Polynomials have an array of monomials and a array of coefficients plus other information.
	These are allocated using cmalloc and freed at the end of the algorithm.

	Matrix rows have an array of coefficients (not duplicated) and an array of indices encoded as follows:
	it first records an index using an entire word, this is followed by a sequence of unsigned characters
	recording differences from 1 to 255 from the previous index, ending in the zero character.
	These are allocated using malloc and freed when the matrix rows are filtered and decoded.
*/

typedef struct f4row {
	INT32 len;	/* number of terms in the matrix row */
	INT32 fac;	/* monomial cofactor from the syzygy */
	INT  *cof;	/* an array of coefficients modulo p */
	INT32 *mon;	/* array of monomials from the basis */
	CHAR *ind;	/* array of column index differences */
	INT32 siz;	/* bytes of encoded sparsity pattern */
	INT32 zzz;	/* divmask or zero if poly redundant */
} f4row;

typedef struct f4syz {
	INT32 lcm;	/* the lead monomial to be cancelled */
	f4row *row0;	/* pointers to the basis polynomials */
	f4row *row1;
} f4syz;

INT	f4_prime;		/* the current prime */
INT32	f4_nvars;		/* total # variables */
INT32	f4_nelim;		/* # for elimination */

INT32  *f4_table, f4_tsize;	/* hash table & size */
INT32  *f4_monom, f4_mload;	/* array & num. used */
INT32	f4_mutex;		/* hash & array lock */

f4row **f4_basis;		/* basis polynomials */
INT32   f4_bload, f4_bsize;	/* num. used / total */

f4syz **f4_pairs;		/* array of syzygies */
INT32   f4_pload, f4_psize;	/* num. used / total */

f4row **f4_array;		/* big sparse matrix */
INT32	f4_aload, f4_asize;	/* rows used / total */

INT32  *f4_mused;		/* monomials present */
INT32	f4_uload, f4_usize;	/* cols used / total */

INT32	f4_limit = 2048;	/* select pair limit */
INT32	f4_dprev = 0;		/* previous pair deg */

INT32  *f4_hashv = 0;		/* hash combinations */


/* monomial exponent, column index */
#define EXPVEC(m) (f4_monom + U(m)*(f4_nvars+2))
#define COLUMN(m) (f4_monom + U(m)*(f4_nvars+2))[f4_nvars]
#define LEXCOL(m) (f4_monom + U(m)*(f4_nvars+2))[f4_nvars+1]


/*
	monomial hash table
*/

/* initialize */
void f4_mon_init(INT nvars, INT nelim)
{
	INT32 m, s, i, j;

	if (nelim > nvars) nelim = nvars;
	if (nelim < 0) nelim = 0;

	f4_nvars = nvars;
	f4_nelim = nelim;

	/* monomial length */
	m = f4_nvars+2;

	f4_hashv = malloc(nvars*sizeof(INT32));
	for (i=0; i < nvars; i++) f4_hashv[i] = urandom32() >> 1;

	/* must be power of two */
	f4_tsize = s = I(1) << 20;
	f4_table = malloc(2*s*sizeof(INT32));
	f4_monom = malloc(m*s*sizeof(INT32));
	f4_mload = 1;	/* EXPVEC(0) == scratch */
	for (i=0; i < 2*s; i++) f4_table[i] = 0;
	for (i=0; i < m*s; i++) f4_monom[i] = 0;
	f4_mutex = 0;
}

/* dispose */
void f4_mon_free()
{
	free(f4_table); free(f4_monom); free(f4_hashv);
	f4_table = f4_monom = 0;
	f4_tsize = f4_mload = 0;
	f4_nvars = 0;
	f4_mutex = 0;
}

/* reload monoms */
void f4_mon_rehash()
{
	INT32 *e, m, h, s, n, i, k;
	n = f4_nvars; s = f4_tsize;
	for (i=0; i < 2*s; i++) f4_table[i] = 0;
	for (m=1; m < f4_mload; m++) {
		e = EXPVEC(m);
		for (h=1,i=0; i < n; i++) h += U(e[i]) * U(f4_hashv[i]);
		for (k=h,i=0; i < s; i++) {
			k = (k+i) & (s-1);
			if (f4_table[2*k] == 0) break;
		}
		f4_table[2*k+0] = (INT32)(h);
		f4_table[2*k+1] = m;
	}
}

/* enlarge table */
void f4_mon_resize()
{
	INT32 m, s;
	s = f4_tsize;
	m = f4_nvars+2;
	s = f4_tsize = 2*s;
	f4_table = realloc(f4_table, 2*s*sizeof(INT32));
	f4_monom = realloc(f4_monom, m*s*sizeof(INT32));
	f4_mon_rehash();
}


/* 
	monomial operations
*/

/* create monomial */
INT f4_mon_new(INT32 *e)
{
	INT32 *T, *v, n, s, h, i, j, k, m, t;
	T = f4_table; s = f4_tsize; n = f4_nvars;
	for (h=1,i=0; i < n; i++) h += U(e[i]) * U(f4_hashv[i]);
	for (k=h,i=0; i < s; i++) {
		k = (k+i) & (s-1);
		if (T[2*k+0] == 0) break;
		if (T[2*k+0] != h) continue;
		m = T[2*k+1]; v = EXPVEC(m);
		/* check that the exponents equal */
		for (j=0; j < n && e[j]==v[j]; j++) ;
		if (j==n) return m;
	}
	m = f4_mload++;
	T[2*k+1] = m; T[2*k+0] = h; v = EXPVEC(m);
	for (j=0; j < n; j++) v[j] = e[j]; v[j] = 0;
	if (m+1 == s/2) f4_mon_resize();
	return m;
}

/* new monomial in thread */
INT f4_mon_new_thread(INT32 *e)
{
	INT32 *T, *v, n, s, h, i, j, k, m;
	T = f4_table; s = f4_tsize; n = f4_nvars;
	for (h=1,i=0; i < n; i++) h += U(e[i]) * U(f4_hashv[i]);
retry:	for (k=h,i=0; i < s; i++) {
		k = (k+i) & (s-1);
		if (T[2*k+0] == 0) break;
		if (T[2*k+0] != h) continue;
		m = T[2*k+1]; v = EXPVEC(m);
		/* check that the exponents equal */
		for (j=0; j < n && e[j]==v[j]; j++) ;
		if (j==n) return m;
	}
	if (cas32(&f4_mutex,0,1)) goto retry;
	for (k=h,i=0; i < s; i++) {
		k = (k+i) & (s-1);
		if (T[2*k+0] == 0) break;
		if (T[2*k+0] != h) continue;
		m = T[2*k+1]; v = EXPVEC(m);
		/* check that the exponents equal */
		for (j=0; j < n && e[j]==v[j]; j++) ;
		if (j==n) goto done;
	}
	m = f4_mload++; v = EXPVEC(m);
	for (j=0; j < n; j++) v[j] = e[j]; v[j] = 0;
	T[2*k+1] = m; cas32(&T[2*k+0],0,h);
done:	f4_mutex = 0;
	return m;
}

/* compare two monomials */
INT32 f4_mon_cmp(INT32 A, INT32 B)
{
	INT32 da, db;
	INT32 *a, *b, n, e, i, j, k;
	if (A==B) return 0;
	a = EXPVEC(A);
	b = EXPVEC(B);
	n = f4_nvars;
	e = f4_nelim;

	if (e >= n) goto plex;
	i = j = 0; k = e;
	if (e) goto comp;
last:	i = j = e; k = n;

comp:	/* grevlex order */
	for (da=db=0; i < k; i++) {
		da += a[i];
		db += b[i];
	}
	if (da != db) return (da-db);
	while (--i > j) {
		if (a[i] != b[i]) return (b[i]-a[i]);
	}
	if (k==n) return 0;
	goto last;

plex:	for (i=0; i < n; i++) {
		if (a[i] != b[i]) return (a[i] - b[i]);
	}
	return 0;
}

/* total degree */
INT32 f4_mon_deg(INT32 A)
{
	INT32 *a, n, i, d;
	n = f4_nvars;
	a = EXPVEC(A);
	for (d=i=0; i < n; i++) d += a[i];
	return d;
}

/* division test */
INT32 f4_mon_div(INT32 A, INT32 B)
{
	INT32 *a, *b, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B);
	for (i=n-1; i >= 0; i--) {
		if (a[i] < b[i]) return 0;
	}
	return 1;
}

/* multiply */
INT32 f4_mon_mul(INT32 A, INT32 B)
{
	INT32 *a, *b, *c, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B); c = EXPVEC(0);
	for (i=0; i < n; i++) c[i] = a[i] + b[i];
	return f4_mon_new(c);
}

/* multiply in thread */
INT32 f4_mon_mul_thread(INT32 A, INT32 B, INT32 *tmp)
{
	INT32 *a, *b, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B);
	for (i=0; i < n; i++) tmp[i] = a[i] + b[i];
	return f4_mon_new_thread(tmp);
}

/* quotient */
INT32 f4_mon_quo(INT32 A, INT32 B)
{
	INT32 *a, *b, *c, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B); c = EXPVEC(0);
	for (i=0; i < n; i++) c[i] = a[i] - b[i];
	return f4_mon_new(c);
}

/* quotient in thread */
INT32 f4_mon_quo_thread(INT32 A, INT32 B, INT32 *tmp)
{
	INT32 *a, *b, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B);
	for (i=0; i < n; i++) tmp[i] = a[i] - b[i];
	return f4_mon_new_thread(tmp);
}

/* greatest common divisor */
INT32 f4_mon_gcd(INT32 A, INT32 B)
{
	INT32 *a, *b, *c, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B); c = EXPVEC(0);
	for (i=0; i < n; i++) c[i] = a[i] > b[i] ? b[i] : a[i];
	return f4_mon_new(c);
}

/* least common multiple */
INT32 f4_mon_lcm(INT32 A, INT32 B)
{
	INT32 *a, *b, *c, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B); c = EXPVEC(0);
	for (i=0; i < n; i++) c[i] = a[i] > b[i] ? a[i] : b[i];
	return f4_mon_new(c);
}

/* test relatively prime */
INT32 f4_mon_prm(INT32 A, INT32 B)
{
	INT32 *a, *b, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B);
	for (i=0; i < n; i++) {
		if (a[i] && b[i]) return 0;
	}
	return 1;
}

/* test A depends on B */
INT32 f4_mon_dep(INT32 A, INT32 B)
{
	INT32 *a, *b, n, i;
	n = f4_nvars;
	a = EXPVEC(A); b = EXPVEC(B);
	for (i=0; i < n; i++) {
		if (!a[i] && b[i]) return 0;
	}
	return 1;
}

/* variable x[k] */
INT32 f4_mon_var(INT32 k)
{
	INT32 *a, n, i;
	n = f4_nvars;
	a = EXPVEC(0);
	for (i=0; i < n; i++) a[i] = (i==k);
	return f4_mon_new(a);
}

/* constant */
INT32 f4_mon_one()
{
	INT32 *a, n, i;
	n = f4_nvars;
	a = EXPVEC(0);
	for (i=0; i < n; i++) a[i] = 0;
	return f4_mon_new(a);
}

/* sort array of monomials */
void f4_mon_sort(INT32 *L, INT32 l)
{
	INT32 i, j, k, m;
	k = l;
	if (k < 2) return;
sort:	k = 5*(k+1)/13;
	for (i=k-1; i < l; i++) {
		m = L[i];
		for (j=i; j >= k; j-=k) {
			if (f4_mon_cmp(m,L[j-k]) > 0) break;
			L[j] = L[j-k];
		}
		L[j] = m;
	}
	if (k > 1) goto sort;
}


/*
	polynomial operations
*/

f4row * f4row_new()
{
	f4row *a;
	a = cmalloc(sizeof(f4row));
	memset((void *)a,0,sizeof(f4row));
	return a;
}

/* sort polynomial */
f4row * f4row_sort(f4row *a)
{
	INT32 i, j, k, l, m;
	INT c;
	l = k = a->len;
	if (k < 2) goto done;
sort:	k = 5*(k+1)/13;
	for (i=k-1; i < l; i++) {
		m = a->mon[i];
		c = a->cof[i];
		for (j=i; j >= k; j-=k) {
			if (f4_mon_cmp(a->mon[j-k],m) >= 0) break;
			a->mon[j] = a->mon[j-k];
			a->cof[j] = a->cof[j-k];
		}
		a->mon[j] = m;
		a->cof[j] = c;
	}
	if (k > 1) goto sort;
done:	return a;
}


/*
	pair operations
*/

void f4_memory()
{
	f4row *a, *b;
	INT32 *e, *f, i, j, k, l, m, s=0;

	/* garbage collect monomials */
	for (i=0; i < f4_mload; i++) COLUMN(i) = 0;
	for (i=0; i < f4_bload; i++) {
		a = f4_basis[i];
		for (j=0; j < a->len; j++) {
			m = a->mon[j];
			COLUMN(m) = 1;
		}
	}
	for (i=0; i < f4_pload; i++) {
		m = f4_pairs[i]->lcm;
		COLUMN(m) = 1;
	}

	/* record new positions */
	for (i=j=1; i < f4_mload; i++) {
		if (COLUMN(i)) COLUMN(i) = j++;
	}
	for (i=0; i < f4_bload; i++) {
		a = f4_basis[i];
		for (j=0; j < a->len; j++) {
			m = COLUMN(a->mon[j]);
			a->mon[j] = m;
		}
	}
	for (i=0; i < f4_pload; i++) {
		m = f4_pairs[i]->lcm;
		f4_pairs[i]->lcm = COLUMN(m);
	}
	for (i=j=1; i < f4_mload; i++) {
		if (COLUMN(i)) {
			e = EXPVEC(i);
			f = EXPVEC(j);
			for (k=0; k < f4_nvars; k++) f[k] = e[k];
			COLUMN(j) = 0;
			j++;
		}
	}
	f4_mload = j;
	f4_mon_rehash();
}

/* add poly to basis */
void f4_update(f4row *a)
{
	f4row *b;
	f4syz *s;
	INT32 i, j, k, l;
	INT32 d0, d1, d2;

	/* room for new polys */
	if (f4_bload + 1 > f4_bsize) {
		f4_bsize = 3*f4_bsize/2;
		f4_basis = realloc(f4_basis, f4_bsize*sizeof(f4row *));
	}

	/* room for new pairs */
	if (f4_pload + f4_bload > f4_psize) {
		f4_psize = 2*f4_psize + f4_bload;
		f4_pairs = realloc(f4_pairs, f4_psize*sizeof(f4syz *));
	}

	/* generate new pairs */
	for (i=l=0; i < f4_bload; i++) {
		b = f4_basis[i];
		if (b->zzz) continue;
		/* Buchberger's first criteria */
		if (f4_mon_prm(a->mon[0],b->mon[0])) continue;
		s = cmalloc(sizeof(f4syz));
		s->lcm = f4_mon_lcm(b->mon[0],a->mon[0]);
		s->row0 = b; s->row1 = a;
		f4_pairs[f4_pload+l] = s;
		l++;	/* # new pairs */
	}

	/* Gebauer & Moller criteria B */
	for (i=0; i < f4_pload; i++) {
		d0 = f4_pairs[i]->lcm;
		d1 = f4_pairs[i]->row0->mon[0];
		d2 = f4_pairs[i]->row1->mon[0];
		if (f4_mon_div(d0,a->mon[0])
		 && f4_mon_lcm(d1,a->mon[0]) != d0
		 && f4_mon_lcm(d2,a->mon[0]) != d0) {
			f4_pairs[i]->lcm = 0;
		}
	}

	/* Gebauer & Moller criteria M */
	for (i=0; i < l; i++) {
		d0 = f4_pairs[f4_pload+i]->lcm;
		if (d0==0) continue;
		for (j=0; j < l; j++) {
			d1 = f4_pairs[f4_pload+j]->lcm;
			if (i==j || d0==d1 || d1==0) continue;
			if (f4_mon_div(d1,d0)) f4_pairs[f4_pload+j]->lcm = 0;
		}
	}

	/* Gebauer & Moller criteria F */
	for (i=0; i < l; i++) {
		d0 = f4_pairs[f4_pload+i]->lcm;
		if (d0==0) continue;
		for (j=i+1; j < l; j++) {
			d1 = f4_pairs[f4_pload+j]->lcm;
			if (d1==0) continue;
			if (d0==d1) f4_pairs[f4_pload+j]->lcm = 0;
		}
	}

	/* compact the set of pairs */
	for (i=j=0; i < f4_pload+l; i++) {
		d0 = f4_pairs[i]->lcm;
		if (d0 == 0) continue;
		f4_pairs[j] = f4_pairs[i];
		j++;
	}
	f4_pload = j;

	/* cull basis elements */
	for (i=j=0; i < f4_bload; i++) {
		b = f4_basis[i];
		if (f4_nelim==0 && f4_mon_div(b->mon[0],a->mon[0])) {
			b->zzz = 1;
		}
	}
	/* add new polynomial */
	f4_basis[f4_bload] = a;
	f4_bload++;
}

/* select pairs */
void f4_select()
{
	f4syz *s, *t;
	f4row **A, *a, *b;
	INT32 d, e, i, j, k, l;

	/* degree, # of pairs */
	d = (INT)1 << 30; k = 0;
	for (i=0; i < f4_pload; i++) {
		s = f4_pairs[i];
		e = f4_mon_deg(s->lcm);
		if (e > d) continue;
		else if (e == d) k++;
		else d = e, k = 1;
	}
	l = k;

	if (d == f4_dprev) {
		f4_limit *= 2;
	}
	else {
		f4_limit = 2048;
		f4_dprev = d;
	}
	if (k > f4_limit) k = f4_limit;

	if (info >= 2) {
		printf("degree=%lld pairs=%lld/%lld with pairs limit %lld\n",
			(long long int)d,
			(long long int)k, 
			(long long int)f4_pload,
			(long long int)f4_limit
		);
	}

	/* allocate matrix */
	if (2*k > f4_asize) {
		f4_array = realloc(f4_array, 2*k*sizeof(f4row *));
		f4_asize = 2*k;
	}
	f4_aload = 0;

	/* seed matrix using pairs */
	for (i=j=0; i < f4_pload; i++) {
		s = f4_pairs[i];
		e = f4_mon_deg(s->lcm);
		if (e > d) continue;
		if (j >= 2*k) continue;

		a = f4row_new();
		b = f4row_new();
		a->len = s->row0->len;
		b->len = s->row1->len;
		a->fac = f4_mon_quo(s->lcm, s->row0->mon[0]);
		b->fac = f4_mon_quo(s->lcm, s->row1->mon[0]);
		a->cof = s->row0->cof;
		b->cof = s->row1->cof;
		a->mon = s->row0->mon;
		b->mon = s->row1->mon;
		a->ind = b->ind = 0;
		f4_array[j++] = a;
		f4_array[j++] = b;

		s->lcm = 0;
	}
	f4_aload = j;

	/* remove selected pairs */
	for (i=j=0; i < f4_pload; i++) {
		s = f4_pairs[i];
		if (s->lcm == 0) ;
		else f4_pairs[j++] = s;
	}
	f4_pload = j;
}

#if MULTI_THREAD

void f4_symbol_thread(void *par)
{
	f4row *a, *b, *c;
	INT32 *param = par;
	INT32 i, j, k, l, m, q, *tmp;
	INT32 ss, st, sz;
	INT32 y, z;

	tmp = 0;
	sz = param[1];
	st = param[0];
start:	ss = cas32(param,st,st+1);
	if (ss != st) {
		st = ss;
		goto start;
	}
	if (st >= sz) goto done;

	if (!tmp) tmp = malloc(f4_nvars*sizeof(INT32));
	a = f4_array[st];
	for (i=0; i < a->len; i++) {
		m = f4_mon_mul_thread(a->fac, a->mon[i], tmp);
		if (COLUMN(m)==1) continue;
		if (cas32(&COLUMN(m),0,1)) continue;

		/* add m to f4_mused atomically */
		z = f4_uload;
	monom:	y = cas32(&f4_uload,z,z+1);
		if (y != z) {
			z = y;
			goto monom;
		}
		f4_mused[z] = m;

		/* we acquired m to process */
		for (k=0; k < f4_bload; k++) {
			b = f4_basis[k];
			if (b->zzz) continue;
			if (f4_mon_div(m,b->mon[0])) break;
		}
		if (k==f4_bload) continue;

		b = f4_basis[k];
		q = f4_mon_quo_thread(m,b->mon[0],tmp);

		/* construct matrix row */
		c = f4row_new();
		c->len = b->len;
		c->fac = q;
		c->cof = b->cof;
		c->mon = b->mon;
		c->ind = 0;

		/* add c to f4_array atomically */
		z = f4_aload;
	array:	y = cas32(&f4_aload,z,z+1);
		if (y != z) {
			z = y;
			goto array;
		}
		f4_array[z] = c;
	}
	goto start;
done:	free(tmp);
}

/* symbolic preprocessing */
void f4_symbol()
{
	INT l;
	INT32 i, j, k, m, n, x, y, z, s, bs, dp, ncpu, big;
	INT32 param[2] = {0};
	INT tid[MAX_CPU] = {0};

	f4_uload = 0;
	f4_usize = 2*f4_aload;
	f4_mused = malloc(f4_usize*sizeof(INT32));

	for (i=0; i < f4_aload; i=j) {
		/* count remaining monomials */
		for (l=0, j=i; j < f4_aload; j++) {
			l += f4_array[j]->len;
			if (l >= f4_tsize/2) break;
		}

		/* enlarge hash table */
		while (f4_mload + l >= 3*f4_tsize/4) f4_mon_resize();

		/* enlarge array of monoms */
		if (f4_uload + l >= f4_usize) {
			f4_usize = 2*f4_usize + l;
			f4_mused = realloc(f4_mused, f4_usize*sizeof(INT32));
		}

		/* enlarge the matrix rows */
		if (f4_aload + l >= f4_asize) {
			f4_asize = 2*f4_asize + l;
			f4_array = realloc(f4_array, f4_asize*sizeof(f4row *));
		}

		param[0] = i;
		param[1] = j;

		ncpu = umin(numcpu(),MAX_CPU);
		for (k=0; k < ncpu; k++) tid[k] = thread(f4_symbol_thread, (void *)param);
		for (k=0; k < ncpu; k++) join(tid[k]);

		while (f4_mload >= f4_tsize/2) f4_mon_resize();
	}

	if (info >= 2) {
		for (l=0, i=0; i < f4_aload; i++) l += f4_array[i]->len;
		printf("%lld x %lld with %lld non-zero, %.1f per row\n", 
			(long long int)f4_aload,
			(long long int)f4_uload,
			(long long int)l,
			(double)l/f4_aload
		);
	}
}

#else

/* symbolic preprocessing */
void f4_symbol()
{
	f4row *a, *b, *c;
	INT32 i, j, k, m, n, x, y, z, s, t;
	INT l;

	f4_uload = 0;
	f4_usize = 2*f4_aload;
	f4_mused = malloc(f4_usize*sizeof(INT32));

	for (l=0, i=0; i < f4_aload; i++) {
		a = f4_array[i];
		l += a->len;

		/* make room for new cols */
		if (f4_uload + a->len > f4_usize) {
			f4_usize = 2*f4_usize + a->len;
			f4_mused = realloc(f4_mused,f4_usize*sizeof(INT32));
		}
		/* make room for new rows */
		if (f4_aload + a->len > f4_asize) {
			f4_asize = 2*f4_asize + a->len;
			f4_array = realloc(f4_array,f4_asize*sizeof(f4row *));
		}
		for (j=0; j < a->len; j++) {
			m = f4_mon_mul(a->fac,a->mon[j]);
			if (COLUMN(m)==1) continue;
			f4_mused[f4_uload++] = m;
			COLUMN(m) = 1;

			/* divide */
			for (k=0; k < f4_bload; k++) {
				b = f4_basis[k];
				if (b->zzz) continue;
				if (f4_mon_div(m,b->mon[0])) break;
			}
			if (k == f4_bload) continue;
			b = f4_basis[k];

			/* add row to matrix */
			c = f4row_new();
			c->len = b->len;
			c->fac = f4_mon_quo(m,b->mon[0]);
			c->cof = b->cof;
			c->mon = b->mon;
			c->ind = 0;
			f4_array[f4_aload++] = c;
		}
	}
	if (info >= 2) {
		printf("%lld x %lld with %lld non-zero, %.1f per row\n", 
			(long long int)f4_aload,
			(long long int)f4_uload,
			(long long int)l,
			(double)l/f4_aload
		);
	}
}

#endif

#if MULTI_THREAD

/* encode matrix rows */
void f4_encode_thread(void *par)
{
	INT32 *param = par;
	unsigned char *buf;
	INT32 i, j, m, n, *tmp;
	INT32 ss, st, ks, kt;
	INT k, l;
	f4row *a;

	buf = 0; tmp = 0;
	st = param[0];	/* row to encode */
row:	ss = cas32(param,st,st+1);
	if (ss != st) {
		st = ss;
		goto row;
	}
	if (st >= f4_aload) goto done;

	n = f4_nvars;
	if (!buf) {
		tmp = malloc(n*sizeof(INT32));
		buf = malloc(f4_uload*(sizeof(INT32)+1));
	}

	/* encode indices for each matrix row, format: */
	/* [ index0, dif0,...,difk, null, index1, ...] */
	/* indices are words and difs / null are bytes */

	a = f4_array[st]; l = -1;
	for (k=0, j=0; j < a->len; j++) {
		/* its already in monom hash table */
		/* so this won't enlarge the table */
		m = f4_mon_mul_thread(a->fac,a->mon[j],tmp);
		m = COLUMN(m);
		if (l == -1) {
			/* store first index */
			*(INT32 *)(buf+k) = m;
			k += sizeof(INT32);
		}
		else if (l > m && l-m <= 255) {
			/* store difference from previous */
			/* in sequence terminated by null */
			buf[k++] = (unsigned char)(l-m);
		}
		else {
			/* null byte to stop the sequence */
			buf[k++] = 0;
			/* store new index */
			*(INT32 *)(buf+k) = m;
			k += sizeof(INT32);
		}
		l = m;
	}
	buf[k++] = 0;	/* extra null */
	a->ind = malloc(k*sizeof(char));
	memcpy(a->ind,buf,k);
	a->siz = k;
	goto row;

done:	free(buf);
	free(tmp);
}

/* form matrix */
void f4_encode()
{
	f4row *a;
	INT32 m;
	INT i, j, k;
	unsigned char *buf;
	INT32 param[1] = {0};
	INT tid[MAX_CPU] = {0};

	/* sort asc, assign columns */
	f4_mon_sort(f4_mused, f4_uload);
	for (i=0; i < f4_uload; i++) {
		m = f4_mused[i];
		COLUMN(m) = i;
	}

	/* start threads */
	k = umin(numcpu(),MAX_CPU);
	for (i=0; i < k; i++) {
		tid[i] = thread(f4_encode_thread,(void *)(&param));
	}
	for (i=0; i < k; i++) join(tid[i]);

	for (i=j=k=0; i < f4_aload; i++) {
		j += f4_array[i]->len;
		k += f4_array[i]->siz;
	}

	if (info >= 3) {
		printf("%.3f bytes per non-zero, matrix encoded in %.3f MB\n",
			(double)k/j,
			(double)k/1024/1024
		);
	}

	/* reset the indices */
	for (i=0; i < f4_uload; i++) {
		m = f4_mused[i];
		COLUMN(m) = 0;
	}
}

#else

/* make matrix */
void f4_encode()
{
	f4row *a;
	INT32 m;
	INT i, j, k, l, s, t;
	unsigned char *buf;

	/* sort asc, assign columns */
	f4_mon_sort(f4_mused, f4_uload);
	for (i=0; i < f4_uload; i++) {
		m = f4_mused[i];
		COLUMN(m) = i;
	}

	/* encode indices for each matrix row, format: */
	/* [ index0, dif0,...,difk, null, index1, ...] */
	/* indices are words and difs / null are bytes */
	buf = malloc(f4_uload*(sizeof(INT32)+1));
	for (s=t=0, i=0; i < f4_aload; i++) {
		a = f4_array[i]; l = -1;
		for (k=j=0; j < a->len; j++) {
			m = f4_mon_mul(a->fac,a->mon[j]);
			m = COLUMN(m);
			if (l == -1) {
				/* store first index */
				*(INT32 *)(buf+k) = m;
				k += sizeof(INT32);
			}
			else if (l > m && l-m <= 255) {
				/* store difference from previous */
				/* in sequence terminated by null */
				buf[k++] = (unsigned char)(l-m);
			}
			else {
				/* null byte to stop the sequence */
				buf[k++] = 0;
				/* store new index */
				*(INT32 *)(buf+k) = m;
				k += sizeof(INT32);
			}
			l = m;
		}
		buf[k++] = 0;	/* extra null */
		a->ind = malloc(k*sizeof(char));
		memcpy(a->ind,buf,k);
		a->siz = k; s += k;
		t += a->len;
	}
	free(buf);

	/* reset the columns */
	for (i=0; i < f4_uload; i++) {
		m = f4_mused[i];
		COLUMN(m) = 0;
	}

	if (info >= 3) {
		printf("%.3f bytes per non-zero, matrix encoded in %.3f MB\n",
			(double)s/t,
			(double)s/1024/1024
		);
	}
}

#endif

/* encode dense vector */
f4row * f4_reduce_export(INT32 n, INT *vec)
{
	f4row *a = 0;
	INT32 i, j, k, l;
	unsigned char *buf;

	if (n < 0) return 0;
	buf = malloc(n*sizeof(INT)+n);
	j = k = 0; l = -1; 
	for (i=n; i >= 0; i--) {
		if (vec[i]==0) continue;
		if (l == -1) {
			*(INT32 *)(buf+k) = i;
			k += sizeof(INT32);
		}
		else if (l-i <= 255) {
			buf[k++] = (unsigned char)(l-i);
		}
		else {
			buf[k++] = 0;
			*(INT32 *)(buf+k) = i;
			k += sizeof(INT32);
		}
		l = i;
		j++;
	}
	if (!j) goto done;

	buf[k++] = 0;	/* extra null */
	a = f4row_new();
	a->len = j;
	a->fac = 0;
	a->cof = cmalloc(j*sizeof(INT));
	for (j=0, i=n; i >= 0; i--) {
		if (vec[i]==0) continue;
		a->cof[j++] = vec[i];
		vec[i] = 0;
	}
	a->mon = 0;
	a->ind = malloc(k*sizeof(char));
	memcpy(a->ind,buf,k);

done:	free(buf);
	return a;
}

/* add a*c to dense vector */
INT f4_reduce_import(INT *vec, f4row *a, INT c)
{
	UINT u, v, w;
	INT  x, y, z, p, p2;
	INT32 i, j, k, s, t;
	p = f4_prime;
	if (!a->len) return -1;
	if (WORDSIZE==64 && p <= 2147483647) {
		i = j = k = 0;
		p2 = ((U(-1) >> 1)/p) * p;
		while (j < a->len) {
			z = *(INT32 *)(a->ind+k); k += sizeof(INT32);
		mul:	x = (a->cof[j++]) * c;
			y = U(vec[z]) + U(x) - U(p2);
			if (y < 0) y += p2;
			t = a->ind[k++];
			s = z; z = z-t;
			vec[s] = y;
			if (t) goto mul;
		}
	}
	else {
		w = ulz(p); u = p << w; v = nreciprocal(u);
		i = j = k = 0;
		while (j < a->len) {
			z = *(INT32 *)(a->ind+k); k += sizeof(INT32);
		mul2:	x = nmulmod(a->cof[j++],c,u,v,w);
			y = U(vec[z]) + U(x) - U(p);
			if (y < 0) y += p;
			t = a->ind[k++];
			s = z; z = z-t;
			vec[s] = y;
			if (t) goto mul2;
		}
	}
	return *(INT32 *)(a->ind);
}

/* reduce vec using pivots */
INT f4_reduce_vector(INT n, INT *vec, f4row **piv)
{
	f4row *a;
	UINT u, v, w;
	INT  x, y, z, c, p, p2;
	INT32 i, j, k, s, t;
	p = f4_prime;
	if (WORDSIZE==64 && p <= 2147483647) {
		p2 = ((U(-1) >> 1)/p) * p;
		for (i=n; i >= 0; i--) {
			c = vec[i]; if (!c) continue;
			c = vec[i] = c % p;
			a = piv[i]; if (!a) continue;
			j = k = 0;
			while (j < a->len) {
				z = *(INT32 *)(a->ind+k); k += sizeof(INT32);
			mul:	x = (a->cof[j++]) * c;
				y = U(vec[z]) - U(x);
				if (y < 0) y += p2;
				t = a->ind[k++];
				s = z; z = z-t;
				vec[s] = y;
				if (t) goto mul;
			}
		}
	}
	else {
		w = ulz(p); u = p << w; v = nreciprocal(u);
		for (i=n; i >= 0; i--) {
			c = vec[i]; if (!c) continue;
			a = piv[i]; if (!a) continue;
			j = k = 0;
			while (j < a->len) {
				z = *(INT32 *)(a->ind+k); k += sizeof(INT32);
			mul2:	x = nmulmod(a->cof[j++],c,u,v,w);
				y = U(vec[z]) - U(x);
				if (y < 0) y += p;
				t = a->ind[k++];
				s = z; z = z-t;
				vec[s] = y;
				if (t) goto mul2;
			}
		}
	}
	return n;
}

/* make row into a pivot */
f4row * f4_reduce_monic(f4row *a)
{
	UINT u, v, w;
	INT  p, c;
	INT32 i;
	p = f4_prime;
	c = uinvmod(a->cof[0],p);
	w = ulz(p); u = p << w; v = nreciprocal(u);
	for (i=0; i < a->len; i++) {
		a->cof[i] = nmulmod(a->cof[i],c,u,v,w);
	}
	return a;
}

#if MULTI_THREAD

/* multi-threaded code */
void f4_reduce_thread(void *par)
{
	INT *param = par;
	INT32 ss, st, sz, bs, zb, zr;
	INT32 i, j, k, l, m, n, o, s;
	f4row **piv, *a, *b;
	INT *vec, p, x;

	piv = (f4row **)(param[4]);
	vec = 0;	/* buffer for reduction */
	zb = param[3];	/* zero-reduction bound */
	bs = param[2];	/* block size to reduce */
	sz = param[1];	/* total number of rows */
	st = param[0];	/* first row in a block */

	/* get block to reduce */
block:	ss = cas(param,st,st+bs);
	if (ss != st) {
		st = ss;
		goto block;
	}
	/* blocks all claimed */
	if (st >= sz) goto done;

	p = f4_prime;
	n = f4_uload;
	if (vec == 0) {
		/* alloc buffer if needed */
		vec = malloc(n*sizeof(INT));
		for (i=0; i < n; i++) vec[i] = 0;
	}

	/* reduce combinations of */
	/* rows st..min(st+bs,sz) */
	k = umin(sz-st,bs); zr = 0;
	for (o=j=0; j < k; j++) {
		/* random combination */
		for (l=0; l < k; l++) {
			x = (urandom() % (p-1)) + 1;
			s = f4_reduce_import(vec, f4_array[st+l], x);
			if (s > o) o = s;
		}
		/* row reduce */
	retry:	f4_reduce_vector(o,vec,piv);
		a = f4_reduce_export(o,vec);
		if (!a) zr++;
		if (zr==zb) break;
		if (!a) continue;
		a = f4_reduce_monic(a);
		x = *(INT32 *)(a->ind);
		/* assign pivot by compare and swap */
		if (cas((INT *)(piv+x),0,(INT)a) == 0) continue;
		o = f4_reduce_import(vec, a, 1);
		goto retry;
	}
	if (info >= 4) printf("%.1f/", 100.0*(st+k)/sz);
	goto block;
done:	free(vec);
}

/* row reduce */
void f4_reduce()
{
	double e = 1e-18;
	f4row **piv, *a, *b;
	INT32 i, j, k, l, m, n, o, s;
	INT x, y, z, p;
	INT32 zb, zr, bs;
	INT param[5];
	INT tid[MAX_CPU];

	p = f4_prime;
	m = f4_aload; n = f4_uload;
	piv = malloc(n*sizeof(f4row *));
	for (i=0; i < n; i++) piv[i] = 0;

	/* select sparse pivots */
	for (l=i=0; i < m; i++) {
		a = f4_array[i];
		k = *(INT32 *)(a->ind);
		b = piv[k];
		if (!b) piv[k] = a;
		else if (a->len < b->len) {
			piv[k] = a;
			f4_array[l++] = b;
		}
		else f4_array[l++] = a;
	}
	k = m = l;

	/* sort non-pivots */
	if (k < 2) goto done;
sort:	k = 5*(k+1)/13;
	for (i=k-1; i < m; i++) {
		a = f4_array[i];
		x = *(INT32 *)(a->ind);
		for (j=i; j >= k; j-=k) {
			b = f4_array[j-k];
			y = *(INT32 *)(b->ind);
			if (x >= y) break;
			f4_array[j] = b;
		}
		f4_array[j] = a;
	}
	if (k > 1) goto sort;
done:
	/* blocksize */
	bs = 3*uroot(m,3);
	if (bs > m-1) bs = m-1;
	if (bs > p-1) bs = p-1;

	/* zb = bound zero reductions */
	for (zb=0; e < 1; zb++) e = e*p;
	if (bs <= zb) bs = 1;
zb = 1;

	if (info >= 3) {
		printf("%lld rows to reduce, blocksize %lld, max threads %lld, zero reductions %lld\n", 
			(long long int)m,
			(long long int)bs,
			(long long int)( (m+bs-1)/bs ),
			(long long int)zb
		);
	}

	param[0] = 0;		/* start */
	param[1] = m;		/* bound */
	param[2] = bs;		/* block */
	param[3] = zb;		/* zeros */
	param[4] = U(piv);	/* pivot */
	k = umin(numcpu(),MAX_CPU);
	for (i=0; i < k; i++) {
		tid[i] = thread(f4_reduce_thread,(void *)(&param));
	}
	for (i=0; i < k; i++) join(tid[i]);
	if (info >= 4) printf("\n");

	/* extract pivots */
	for (i=j=0; i < n; i++) {
		a = piv[i];
		if (!a) continue;
		f4_array[j++] = a;
	}
	f4_aload = j;
	free(piv);
}

#else

/* row reduce */
void f4_reduce()
{
	double e = 1e-18;
	f4row **piv, *a, *b;
	INT *vec, p;
	INT32 i, j, k, l, m, n, o, s, x, y, z, zb, zr, bs;

	p = f4_prime;
	m = f4_aload; n = f4_uload;
	vec = malloc(n*sizeof(INT));
	piv = malloc(n*sizeof(f4row *));
	for (i=0; i < n; i++) vec[i] = 0;
	for (i=0; i < n; i++) piv[i] = 0;

	/* select sparse pivots */
	for (l=i=0; i < m; i++) {
		a = f4_array[i];
		k = *(INT32 *)(a->ind);
		b = piv[k];
		if (!b) piv[k] = a;
		else if (a->len < b->len) {
			piv[k] = a;
			f4_array[l++] = b;
		}
		else f4_array[l++] = a;
	}
	k = m = l;
	if (k < 2) goto done;
sort:	k = 5*(k+1)/13;
	/* sort remaining rows */
	for (i=k-1; i < m; i++) {
		a = f4_array[i];
		x = *(INT32 *)(a->ind);
		for (j=i; j >= k; j-=k) {
			b = f4_array[j-k];
			y = *(INT32 *)(b->ind);
			if (x >= y) break;
			f4_array[j] = b;
		}
		f4_array[j] = a;
	}
	if (k > 1) goto sort;
done:
	/* blocksize */
	bs = 3*uroot(m,3);
	if (bs > m-1) bs = m-1;
	if (bs > p-1) bs = p-1;

	/* zb = bound zero reductions */
	for (zb=0; e < 1; zb++) e = e*p;
	if (bs <= zb) bs = 1;
zb = 1;

	if (info >= 3) {
		printf("%lld rows to reduce, blocksize %lld, max threads %lld, zero reductions %lld\n", 
			(long long int)m,
			(long long int)bs,
			(long long int)( (m+bs-1)/bs ),
			(long long int)zb
		);
	}

	/* for each block of rows we reduce */
	/* random linear combinations until */
	/* we obtain a zero vector zb times */
	for (i=0; i < m; i+=bs) {
		k = umin(m-i,bs); zr = 0;
		for (o=j=0; j < k; j++) {
			/* random combination */
			for (l=0; l < k; l++) {
				x = (urandom() % (p-1)) + 1;
				s = f4_reduce_import(vec,f4_array[i+l],x);
				if (s > o) o = s;
			}
			/* row reduce */
			f4_reduce_vector(o,vec,piv);
			a = f4_reduce_export(o,vec);
			if (!a) zr++;
			if (zr==zb) break;
			if (!a) continue;
			a = f4_reduce_monic(a);
			x = *(INT32 *)(a->ind);
			piv[x] = a;
		}
		if (info >= 4) printf("%.1f/",100.0*(i+k)/m);
	}
	if (info >= 4) printf("\n");

	/* extract pivots */
	for (i=j=0; i < n; i++) {
		a = piv[i];
		if (!a) continue;
		f4_array[j++] = a;
	}
	f4_aload = j;
	free(vec);
	free(piv);
}

#endif

/* new pivots */
void f4_filter()
{
	f4row *a, *b;
	INT32 i, j, m, n, z, w;
	/* take new leading monomials */
	for (i=j=0; i < f4_aload; i++) {
		a = f4_array[i];
		/* new pivots are not of the form */
		/* basis[i]*m, so a->fac is unset */
		if (a->fac == 0) f4_array[j++] = a;
		else { free(a->ind); a->ind = 0; }
	}
	f4_aload = j;
}

#if MULTI_THREAD

/* multi-threaded code */
void f4_jordan_thread(void *par)
{
	INT *vec, p;
	INT *param = par;
	INT32 ss, st, sz;
	INT32 i, j, k, l, m, n, o, s, x;
	f4row **piv, **red, **arr, *a, *b;

	piv = (f4row **)(param[3]);
	red = (f4row **)(param[2]);
	vec = 0;	/* buffer for reduction */
	sz = param[1];	/* total number of rows */
	st = param[0];	/* first row in a block */

	/* get block to reduce */
block:	ss = cas(param,st,st+1);
	if (ss != st) {
		st = ss;
		goto block;
	}
	/* blocks all claimed */
	if (st >= sz) goto done;

	p = f4_prime;
	n = f4_uload;
	if (vec == 0) {
		/* alloc buffer if needed */
		vec = malloc(n*sizeof(INT));
		for (i=0; i < n; i++) vec[i] = 0;
	}

	/* reduce lower order terms using pivots */
	s = f4_reduce_import(vec, f4_array[st], 1);
	f4_reduce_vector(s-1,vec,piv);
	a = f4_reduce_export(s,vec);
	x = *(INT32 *)(a->ind);
	red[x] = a;

	goto block;
done:	free(vec);
}

/* back substitution */
void f4_jordan()
{
	f4row **piv, **red, *a;
	INT *vec, p;
	INT32 i, j, k, l, m, n;
	INT param[4];
	INT tid[MAX_CPU];

	p = f4_prime;
	m = f4_aload; n = f4_uload;
	piv = malloc(n*sizeof(f4row *));
	red = malloc(n*sizeof(f4row *));
	for (i=0; i < n; i++) piv[i] = 0;
	for (i=0; i < n; i++) red[i] = 0;

	/* create pivots */
	for (l=i=0; i < m; i++) {
		a = f4_array[i];
		k = *(INT32 *)(a->ind);
		piv[k] = a;
	}

	param[0] = 0;		/* start */
	param[1] = m;		/* bound */
	param[2] = U(red);	/* result */
	param[3] = U(piv);	/* pivots */
	k = umin(numcpu(),MAX_CPU);
	for (i=0; i < k; i++) {
		tid[i] = thread(f4_jordan_thread,(void *)(&param));
	}
	for (i=0; i < k; i++) join(tid[i]);

	/* extract pivots */
	for (i=j=0; i < n; i++) {
		a = red[i];
		if (!a) continue;
		f4_array[j++] = a;
	}
	f4_aload = j;
	free(piv);
	free(red);
}

#else

/* back substitution */
void f4_jordan()
{
	f4row **piv, *a;
	INT *vec, p;
	INT32 i, j, k, l, m, n;
	p = f4_prime;
	m = f4_aload; n = f4_uload;
	vec = malloc(n*sizeof(INT));
	piv = malloc(n*sizeof(f4row *));
	for (i=0; i < n; i++) vec[i] = 0;
	for (i=0; i < n; i++) piv[i] = 0;

	/* create pivots */
	for (l=i=0; i < m; i++) {
		a = f4_array[i];
		k = *(INT32 *)(a->ind);
		piv[k] = a;
	}
	/* reduce pivots */
	for (i=0; i < n; i++) {
		a = piv[i];
		if (!a) continue;
		f4_reduce_import(vec,a,1);
		piv[i] = 0;
		f4_reduce_vector(i,vec,piv);
		a = f4_reduce_export(i,vec);
		piv[i] = f4_reduce_monic(a);
	}
	for (j=i=0; i < n; i++) {
		a = piv[i];
		if (!a) continue;
		f4_array[j++] = a;
	}
	f4_aload = j;
	free(vec);
	free(piv);
}

#endif

/* matrix to polynomials */
void f4_decode()
{
	f4row *a;
	INT32 i, j, k, l, m, n, x, y, z;
	for (i=0; i < f4_aload; i++) {
		a = f4_array[i];
		n = a->len;
		if (n==0) continue;
		a->mon = cmalloc(n*sizeof(INT32));
		j = k = 0;
		while (j < a->len) {
			z = *(INT32 *)(a->ind+k); k += sizeof(INT32);
		monom:	a->mon[j] = f4_mused[z];
			l = z; j++;
			if (a->ind[k]) {
				z = l - a->ind[k]; k++;
				goto monom;
			}
			else k++;
		}
		free(a->ind);
		a->ind = 0;
		a->fac = 0;
		f4_array[i] = a;
	}
}

/* output basis */
void f4_output()
{
	f4row *a, *b;
	INT32 i, j, k, l, m, x, y;
	m = f4_mon_one();
	if (f4_asize < f4_bload) {
		f4_asize = f4_bload;
		f4_array = realloc(f4_array, f4_asize*sizeof(f4row *));
	}
	f4_aload = f4_bload;
	for (i=0; i < f4_bload; i++) {
		b = f4_basis[i];
		a = f4row_new();
		a->len = b->len;
		a->fac = m;
		a->cof = b->cof;
		a->mon = b->mon;
		a->ind = 0;
		f4_array[i] = a;
	}

	k = l = f4_aload;
	if (k < 2) goto done;
sort:	k = 5*(k+1)/13;
	/* sort remaining rows */
	for (i=k-1; i < l; i++) {
		a = f4_array[i];
		x = a->mon[0];
		for (j=i; j >= k; j-=k) {
			b = f4_array[j-k];
			y = b->mon[0];
			if (f4_mon_cmp(x,y) >= 0) break;
			f4_array[j] = b;
		}
		f4_array[j] = a;
	}
	if (k > 1) goto sort;
done:	return ;
}

/* cull basis */
void f4_remove()
{
	f4row *a, *b;
	INT32 i, j;
	for (i=0; i < f4_aload; i++) {
		b = f4_array[i];
		if (!b) continue;
		for (j=0; j < i; j++) {
			a = f4_array[j];
			if (!a) continue;
			if (f4_mon_div(b->mon[0],a->mon[0])) break;
		}
		if (j < i) f4_array[i] = 0;
	}
	for (i=j=0; i < f4_aload; i++) {
		if (!f4_array[i]) continue;
		f4_array[j++] = f4_array[i];
	}
	f4_aload = j;
}

/* syzygy pairs */
void f4_sort_pairs()
{
	f4syz *b, *c;
	INT32 i, j, k, lb, lc;
	/* sort pairs */
	k = f4_pload;
	if (k < 2) return;
sort:	k = 5*(k+1)/13;
	for (i=k-1; i < f4_pload; i++) {
		b = f4_pairs[i];
		for (j=i; j >= k; j-=k) {
			c = f4_pairs[j-k];
			if (f4_mon_cmp(b->lcm,c->lcm) > 0) break;
			f4_pairs[j] = f4_pairs[j-k];
		}
		f4_pairs[j] = b;
	}
	if (k > 1) goto sort;
}

/* put in array */
void f4_addrow(f4row *a)
{
	if (f4_aload + 1 > f4_asize) {
		f4_asize = 3*f4_asize/2;
		f4_array = realloc(f4_array, f4_asize*sizeof(f4row *));
	}
	f4_array[f4_aload] = a;
	f4_aload++;
}

void f4mod_init(INT32 n, INT32 e, INT p)
{
	f4_dprev = 0;
	f4_limit = 2048;
	f4_prime = p;
	f4_mon_init(n,e);
	f4_aload = f4_bload = f4_pload = 0;
	f4_asize = f4_bsize = f4_psize = 30;
	f4_array = malloc(f4_asize*sizeof(f4row *));
	f4_basis = malloc(f4_bsize*sizeof(f4row *));
	f4_pairs = malloc(f4_psize*sizeof(f4syz *));
}

void f4mod_free()
{
	free(f4_array);	free(f4_basis); free(f4_pairs);
	f4_aload = f4_bload = f4_pload = 0;
	f4_asize = f4_bsize = f4_psize = 0;
	cmalloc_free();
	f4_mon_free();
	f4_prime = 0;
}

/* F4 mod p */
void f4gb_mod()
{
	f4row *b, *c;
	INT32 d, e, i, j, k, n, step;
	INT p;
	double tt, t0, t1;

	n = f4_nvars;
	e = f4_nelim;
	p = f4_prime;
	tt = realtime();

	printf("F4 eliminate %lld/%lld variables mod p=%lld, %lld threads\n", (long long int)e, (long long int)n, (long long int)p, (long long int)numcpu());

	for (i=0; i < f4_aload; i++) {
		b = f4_array[i];
		b = f4_reduce_monic(b);
		f4_update(b);
	}
	step = 0;
	while (f4_pload > 0) {
		t0 = realtime();

		step++;
		printf("\nSTEP %lld\n", (long long int)step);

		f4_sort_pairs();
		f4_memory();	/* free up memory for alg */
		f4_select();	/* select pairs to reduce */
		f4_symbol();	/* symbolic preprocessing */
		f4_encode();	/* encode polys in matrix */
		f4_reduce();	/* fast Gauss elimination */
		f4_filter();	/* select new polynomials */
		f4_jordan();	/* fast back substitution */
		f4_decode();	/* convert matrix to poly */
		for (i=0; i < f4_aload; i++) {
			b = f4_array[i];
			f4_update(b);
		}
		t1 = realtime();
		printf("new=%lld basis=%lld, step time=%.3f sec\n", (long long int)f4_aload, (long long int)f4_bload, t1-t0);
	}
	printf("\nDONE inter-reduce\n");

	f4_output();
	f4_remove();
	f4_symbol();
	f4_encode();
	f4_jordan();
	f4_decode();
	f4_remove();

	tt = realtime()-tt;
	printf("%lld basis elements, %.3f sec\n", (long long int)f4_aload, tt);
}


/*
	import polynomials
*/

#define MAXVARS 1024
char * vars[MAXVARS] = {0};

/* get integer */
INT getint(char *s, int *l)
{
	int i, j;
	INT c = 0;
	for (i=0; s[i]; i++) {
		if ('0' <= s[i] && s[i] <= '9') {
			c = 10*c + (s[i] - '0');
		}
		else break;
	}
	*l = i; return c;
}

/* get variable */
char * getvar(char *s, int *l)
{
	char *v;
	int i, j;
	for (i=0; s[i]; i++) {
		/* variables begin with a letter */
		/* or underscore but can include */
		/* digits after the first letter */
		if (('_' <= s[i] && s[i] <= 'z')
		 || ('A' <= s[i] && s[i] <= 'Z')
		 || ('0' <= s[i] && s[i] <= '9' && i > 0)
		) continue; else break;
	}
	if (i==0) return 0;
	v = malloc((i+1)*sizeof(char));
	for (j=0; j < i; j++) v[j] = s[j]; v[j] = 0;
	*l = i; return v;
}

/* import one term of polynomial */
INT32 getmon(char *s, INT *c, int *l)
{
	INT32 z[MAXVARS] = {0};
	INT b, p, t;
	INT32 m, n, e;
	int i, j, k;
	char *v;

	n = f4_nvars;
	p = f4_prime; 
	*c = 1; i = 0;
next:	switch (s[i]) {
	case '+':	if (i > 0) goto done;
			i++; t=+1; break;
	case '-':	if (i > 0) goto done;
			i++; t=-1; break;
	case '*':	i++; break;
	case '/':	goto fail;
	case '\n':	goto done;
	}

	/* coefficient? */
	b = getint(s+i,&j);
	if (j > 0 && t==+1) *c = b % p;
	if (j > 0 && t==-1) *c = (p-b) % p;
	i += j; if (j) goto next;

	/* var^exponent */
	v = getvar(s+i,&j);
	if (!v) goto fail;
	i += j; e = 1;
	if (s[i] == '^') {
		i++;
		e = (INT32)getint(s+i,&j);
		if (j==0) goto fail;
		i += j;
	}

	/* put exponent in z */
	for (k=0; k < n; k++) {
		if (strcmp(v,vars[k])) continue;
		z[k] = e; break;
	}
	free(v);
	if (k==n) goto fail;
	goto next;

done:	*l = i;
	return f4_mon_new(z);

fail:	printf("error: can't parse term\n");
	return 0;
}

/* import expanded polynomial */
f4row * getpol(char *s, int *l)
{
	f4row *b;
	INT c;
	INT32 m;
	int i, j, k;

	/* count the number of terms */
	for (k=1, i=0; s[i] != '\n'; i++) {
		if (s[i] == '+' || s[i] == '-') k++;
	}

	b = f4row_new();
	b->len = b->fac = 0;
	b->cof = malloc(k*sizeof(INT));
	b->mon = malloc(k*sizeof(INT32));
	b->ind = 0;

	for (i=k=0; s[i] != '\n'; i+=j) {
		if (s[i]==0) return 0;
		m = getmon(s+i,&c,&j);
		if (m==0) break;
		if (c==0) continue;
		b->cof[k] = c;
		b->mon[k] = m;
		b->len =  ++k;
	}
	f4row_sort(b);

	*l = i;
	return b;
}

/* write polynomial to file */
void putpol(f4row *b, FILE *out)
{
	INT32 *e, i, j, m, n;
	INT c;
	n = f4_nvars;
	for (i=0; i < b->len; i++) {
		c = b->cof[i];
		m = b->mon[i];
		e = EXPVEC(m);
		fprintf(out,"%+lld",(long long int)c);
		for (j=0; j < n; j++) {
			if (e[j] == 0) continue;
			if (e[j] == 1) fprintf(out,"*%s",vars[j]);
			if (e[j] >= 2) fprintf(out,"*%s^%lld",vars[j],(long long int)e[j]);
		}
	}
}

#undef NORMAL
#undef EXPVEC
#undef LEXCOL

int main(int argc, char **argv)
{
	FILE *F = 0;
	char *f = 0;
	char *g = 0;
	char *s = 0;
	int i, j, k, l;
	long long int p, n, e, t;
	f4row *b;

	setbuf(stdout,0);
	p = n = e = t = 0;
	for (i=1; i < argc; i++) {
		if (!strcmp("-t", argv[i]) && i+1 < argc) {
			sscanf(argv[i+1], "%llu", &t);
			i++;
		}
		else if (!strcmp("-e", argv[i]) && i+1 < argc) {
			sscanf(argv[i+1], "%llu", &e);
			i++;
		}
		else if (!strcmp("-p", argv[i]) && i+1 < argc) {
			sscanf(argv[i+1], "%llu", &p);
			i++;
		}
		else if (!strcmp("-v", argv[i]) && i+1 < argc) {
			s = argv[i+1];
			for (j=k=0; s[j]; j++) {
				if (s[j] == '[') continue;
				if (s[j] == ',') continue;
				if (s[j] == ']') continue;
				vars[k] = getvar(s+j, &l);
				if (!vars[k]) printf("error: can not parse variables from %s\n",s+j);
				if (!vars[k]) return 0;
				j += l-1; k++;
			}
			n = k;
			i++;
		}
		else {
			f = argv[i];
		}
	}
	if (t > 0) num_thread = (int)t;

	/* check necessary arguments */
	if (p==0) printf("error: no prime specified, use -p PRIME\n");
	if (n==0) printf("error: no variables found, use -v [x1,x2,...]\n");
	if (f==0) printf("error: need a filename for input polynomials\n");
	if (!p || !n || !f) return 0;

	/* initialize */
	f4mod_init(n,e,p);

	F = fopen(f,"r");
	if (!F) printf("error: file not found\n");
	if (!F) return 0;

	/* read input file into memory */
	k = fgetc(F); for (i=1; k != EOF; i++) k = fgetc(F);
	s = malloc(i*sizeof(char));
	if (!s) printf("error: file too large?\n");
	if (!s) return 0;

	rewind(F);
	k = fgetc(F);
	for (i=j=0; k != EOF; i++) {
		s[i] = (char)k;
		if (s[i]=='\r') i--;
		k = fgetc(F);
	}
	s[i] = 0;
	fclose(F);

	/* get polynomials */
	for (i=0; s[i]; i++) {
		b = getpol(s+i,&j);
		if (!b) printf("error: can not parse polynomial on line %lld\n", (long long int)f4_aload+1);
		if (!b) return 0;

		f4_addrow(b);
		i += j;
	}
	free(s);

	f4gb_mod();

	g = malloc(strlen(f)+5);
	for (i=0; f[i]; i++) g[i] = f[i];
	g[i++] = '.';
	g[i++] = 'o';
	g[i++] = 'u';
	g[i++] = 't';
	g[i++] = 0;

	F = fopen(g,"w");
	for (i=0; i < f4_aload; i++) {
		b = f4_array[i];
		putpol(b,F);
		putc('\n',F);
	}
	fclose(F);

	f4mod_free();
	return 0;
}
