Mark Bishop

Analytical Chemistry and

Signal Acquisition/Analysis



/**
 * Copyright 2013 Mark Bishop
 * Adapted from sample code: Copyright 2011 Marco Hutter - http://www.jcuda.org: 
 * 
 *  This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version. 
 
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details: http://www.gnu.org/licenses
 
    The author makes no warranty for the accuracy, completeness, safety, or usefulness
    of any information provided and does not represent that its use would not infringe
    privately owned right.
 */
 
import static jcuda.jcublas.JCublas.*;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import jcuda.*;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaMemcpyKind;
 
public class GpuLU {
 
	/**
	 * Invert A[][]
	 * @param A A(m,m)
	 * @return A(m,m) Inverse
	 */
	public static float[][] Invert(float[][] A) {
		int m = A.length;
		float[] a1D = Utilities.To1D(A);
		a1D = Invert(m, a1D);
		float[][] aInv = Utilities.To2D(a1D, m, m);
		return aInv;
	}
 
	/**
	 * Invert A[]
	 * @param m row count = column count
	 * @param A(m*m)
	 * @return A(m,m) Inverse
	 */
	public static float[] Invert(int m, float A[]) {
		cublasInit();
		Pointer dA = new Pointer();
		cublasAlloc(m * m, Sizeof.FLOAT, dA);
		cublasSetMatrix(m, m, Sizeof.FLOAT, Pointer.to(A), m, dA, m);
 
		invertMatrix(m, dA);
 
		cublasGetMatrix(m, m, Sizeof.FLOAT, dA, m, Pointer.to(A), m);
		cublasFree(dA);
		cublasShutdown();
		return A;
	}
 
	/**
	 * Get P and (encoded) LU for A(m=n,n): such that PA = LU
	 * @param A A(m=n,n)
	 * @return LU[0] = row permutation matrix, LU[1] = encoded LU
	 * 
	 */
	public static float[][][] LU(float A[][]) {
		cublasInit();
 
		int m = A.length;
		int n = A[0].length;
		float[] A1D = Utilities.To1D(A);
		float[] LU = new float[2 * m * n];
 
		Pointer dA = new Pointer();
 
		cublasAlloc(m * n, Sizeof.FLOAT, dA);
		cublasSetMatrix(m, n, Sizeof.FLOAT, Pointer.to(A1D), m, dA, n);
 
		int[] pivot = cudaSgetrfSquare(n, dA);
 
		JCuda.cudaMemcpy(Pointer.to(LU), dA, m * n * Sizeof.FLOAT,
				cudaMemcpyKind.cudaMemcpyDeviceToHost);
 
		cublasFree(dA);
		cublasShutdown();
 
		cublasInit();
 
		float[][] P = CpuMatrixOps.Eye(n);
 
		float[] P1D = Utilities.To1D(P);
 
		Pointer dP = new Pointer();
 
		cublasAlloc(m * n, Sizeof.FLOAT, dP);
		cublasSetMatrix(m, n, Sizeof.FLOAT, Pointer.to(P1D), m, dP, n);
 
		// Perform inversion on factorized matrix
				cudaSgetri(n, dP, pivot);
 
		JCuda.cudaMemcpy(Pointer.to(P1D), dP, m * n * Sizeof.FLOAT,
				cudaMemcpyKind.cudaMemcpyDeviceToHost);
 
		P = CpuMatrixOps.Transpose(Utilities.To2D(P1D, m, n));
		float[][] ALU = CpuMatrixOps.Transpose(Utilities.To2D(LU, m, n));
		float[][][] result = new float[2][][];
		result[0] = P;
		result[1] = ALU;
 
		cublasFree(dP);
		cublasShutdown();
		return result;
	}
 
	/**
	 * Invert the A(n,m=n) that is given in device memory.
	 * @param n row count = column count
	 * @param dA Pointer A
	 */
	private static void invertMatrix(int n, Pointer dA) {
		// Perform LU factorization
		int[] pivots = cudaSgetrfSquare(n, dA);
		// Perform inversion on factorized matrix
		cudaSgetri(n, dA, pivots);
	}
 
	/**
	 * Convenience method that returns a pointer with the given offset (in
	 * number of 4-byte float elements) from the given pointer.
	 * 
	 * @param p pointer
	 * @param floatOffset offset
	 * @return The new pointer
	 */
	private static Pointer at(Pointer p, int floatOffset) {
		return p.withByteOffset(floatOffset * Sizeof.FLOAT);
	}
 
	/**
	 * cudaSgetrf performs an in-place LU factorization on a square matrix. Uses
	 * the unblocked BLAS2 approach
	 * 
	 * @param n row count = column count
	 * @param dA pointer to the matrix (in device memory)
	 * @return The permutation pivots
	 */
	private static int[] cudaSgetrfSquare(int n, Pointer dA) {
		int[] pivots = new int[n];
		for (int i = 0; i < n; i++) {
			pivots[i] = i;
		}
 
		float[] factor = { 0.0f };
		Pointer pFactor = Pointer.to(factor);
		for (int i = 0; i < n - 1; i++) {
			Pointer offset = at(dA, i * n + i);
 
			int pivot = i - 1 + cublasIsamax(n - i, offset, 1);
			if (pivot != i) {
				pivots[i] = pivot;
				cublasSswap(n, at(dA, pivot), n, at(dA, i), n);
			}
 
			cublasGetVector(1, Sizeof.FLOAT, offset, 1, pFactor, 1);
			cublasSscal(n - i - 1, 1 / factor[0], at(offset, 1), 1);
			cublasSger(n - i - 1, n - i - 1, -1.0f, at(offset, 1), 1,
					at(offset, n), n, at(offset, n + 1), n);
		}
		return pivots;
	}
 
	/***
	 * cudaSgetri Computes the inverse of an LU-factorized square matrix
	 * @param n row count = column count
	 * @param dA The matrix in device memory
	 * @param pivots The permutation pivots
	 */
	private static void cudaSgetri(int n, Pointer dA, int[] pivots) {
		cudaStrtri(n, dA);
 
		// Solve inv(A)*L = inv(U)
		Pointer dWork = new Pointer();
		cublasAlloc(n - 1, Sizeof.FLOAT, dWork);
 
		for (int i = n - 1; i > 0; i--) {
			Pointer offset = at(dA, ((i - 1) * n + i));
			cudaMemcpy(dWork, offset, (n - 1) * Sizeof.FLOAT,
					cudaMemcpyDeviceToDevice);
			cublasSscal(n - i, 0, offset, 1);
			cublasSgemv('n', n, n - i, -1.0f, at(dA, i * n), n, dWork, 1, 1.0f,
					at(dA, ((i - 1) * n)), 1);
		}
 
		cublasFree(dWork);
 
		// Pivot back to original order
		for (int i = n - 1; i >= 0; i--) {
			if (i != pivots[i]) {
				cublasSswap(n, at(dA, i * n), 1, at(dA, pivots[i] * n), 1);
			}
		}
	}
 
	/***
	 * cudaStrtri Computes the inverse of an upper triangular matrix in place
	 * Uses the unblocked BLAS2 approach
	 * @param n row count = column count
	 * @param dA  The matrix
	 */
	private static void cudaStrtri(int n, Pointer dA) {
		float[] factor = { 0.0f };
		Pointer pFactor = Pointer.to(factor);
		for (int i = 0; i < n; i++) {
			Pointer offset = at(dA, i * n);
			cublasGetVector(1, Sizeof.FLOAT, at(offset, i), 1, pFactor, 1);
			factor[0] = 1 / factor[0];
			cublasSetVector(1, Sizeof.FLOAT, pFactor, 1, at(offset, i), 1);
			cublasStrmv('u', 'n', 'n', i, dA, n, offset, 1);
			cublasSscal(i, -factor[0], offset, 1);
		}
	}
 
}
 
 
ID may be exposed