#  Demonstration code for Scientific Computing class,
#  http://www.math.nyu.edu/faculty/goodman/teaching/SciComp2024/
#  PerformanceDemo.py
#  uses Python 3 and Numpy
#  Matrix multiplication demo

#  Observe how the performance of different ways to do the same
#  computation in Python

import numpy        as     np
import numpy.linalg as     la
import time

""" A code for a live demo of Python code performance doing the same
    task on matrices coded in various ways.
    
    First, the matrix product C=AB done in three ways.
       -- Scalar loops, directly coding the matrix multiplication
          formulas.
       -- Vectorized loops, using the numpy integrated dot product
          to find the dot product of a row of A with a column of B
          This has the same memory access pattern as the scalar loop
          method, but avoids repeatedly interpreting the command that
          does the multiplication.
       -- Using the integrated @ (matrix multiplication symbol), which
          calls LApack to do the matrix multiply.
          This does block matrix multiplication with a block size
          optimized for the cache structure of the processor.
          
    Second, calculate the sum of the elements of a matrix.  This is a
          double loop.  In both cases the inner loop is vectorized.
          The difference is only the memory access pattern
       -- Column first: access in the order
             A_{11}, A_{21}, ... , A{n1}, A_{12}, A_{22}, ...
       -- Row first: access in the order
             A_{11}, A_{12}, ... , A{1n}, A_{21}, A_{22}, ...
             
    The code is meant to be run as a live demo so you experience first
    hand the performance differences.
    Type: [python command] PerformanceDemo.py and watch
"""

#-------------------------------------------------------------------
#-------------- Matrix multiplication ------------------------------
#-------------------------------------------------------------------

nsl = [100, 200, 300]   # n values for scalar loop matrix tests

# --------------- Scalar loop -------------------------------------

print("\n matrix multiplication, scalar loops")
for n in nsl:
    A = np.zeros([n,n])     # The zero matrix is good enough for the demo
    B = np.zeros([n,n])
    C = np.zeros([n,n])     # will be the matrix product AB
    
    outLine = "   n =  {n:3d}".format(n=n)
    print(outLine + ", starting ... ", end="", flush = True)
    t_start = time.time()
  
#        Scalar loop matrix multiplication

    for i in range(n):
       for j in range(n):
          sum = 0.
          for k in range(n):
             sum = sum + A[i,k]*B[k,j]
          C[i,j] = sum
          
    t_end = time.time()
    t_op  = t_end - t_start
    outLine = "done, time = {t:7.2f} sec.".format(t = t_op)
    print(outLine)
    
# --------------- Vectorized inner loop -------------------------------------

print("\n matrix multiplication, vectorized inner loop")
nvl = [500, 1000, 1500]   # n values for vectorized loop tests
for n in nvl:
    A = np.zeros([n,n])     # The zero matrix is good enough for the demo
    B = np.zeros([n,n])
    C = np.zeros([n,n])     # will be the matrix product AB
    
    outLine = "   n =  {n:5d}".format(n=n)
    print( outLine + ", starting ... ", end="", flush = True)
    t_start = time.time()
    
 #    Matrix multiplication with the innermost loop vectorized
 
    for i in range(n):
       for j in range(n):
          C[i,j] = np.dot(A[i,:], B[:,j])
          
    t_end = time.time()
    t_op  = t_end - t_start
    outLine = "done, time = {t:7.2f} sec.".format(t = t_op)
    print(outLine)
    
# --------------- Matrix multiplication using LApack ---------------------
          
print("\n matrix multiplication, LApack")
nla = [5000, 10000, 15000]   # n values for LApack tests
for n in nla:
    A = np.zeros([n,n])     # The zero matrix is good enough for the demo
    B = np.zeros([n,n])
    
    outLine = "   n =  {n:5d}".format(n=n)
    print( outLine + ", starting ... ", end="", flush = True)
    t_start = time.time()

#    Matrix multiplication using the LApack routine

    C = A@B
    
    t_end = time.time()
    t_op  = t_end - t_start
    outLine = "done, time = {t:7.2f} sec.".format(t = t_op)
    print(outLine)
    
#-------------------------------------------------------------------
#-------------- Matrix summation ----------------------------------
#-------------------------------------------------------------------
    
# --------------- matrix sum, column-wise ---------------------

ncs = [10000, 20000, 40000]    # matrix sizes for column first tests

print("\n column-wise summation of matrix elements")

for n in ncs:
   A   = np.zeros([n,n])
   sum = 0.               # will be sum_{j,k} A_{jk}
   
   outLine = "   n =  {n:5d}".format(n=n)
   print( outLine + ", starting ... ", end="", flush = True)
   t_start = time.time()

#        Sum the elements of A, with columns in the inner loop

   for k in range(n):
      sum = sum + np.sum(A[:,k])
      
   t_end = time.time()
   t_op  = t_end - t_start
   outLine = "done, time = {t:7.2f} sec.".format(t = t_op)
   print(outLine)
      
# --------------- matrix sum, row-wise ---------------------
    
nrs = [10000, 20000, 40000]      # matrix sizes for row first tests

print("\n row-wise summation of matrix elements")

for n in nrs:
   A = np.zeros([n,n])
   sum = 0.               # will be sum_{j,k} A_{jk}
   
   outLine = "   n =  {n:5d}".format(n=n)
   print( outLine + ", starting ... ", end="", flush = True)
   t_start = time.time()

#        Sum the elements of A, with rows in the inner loop

   for k in range(n):
      sum = sum + np.sum(A[k,:])
      
   t_end = time.time()
   t_op  = t_end - t_start
   outLine = "done, time = {t:7.2f} sec.".format(t = t_op)
   print(outLine)