\ parallel-mm.4th
\
\ Very simple example of matrix multiplication by parallel processing 
\ on a multi-core machine using the syscall, fork.
\
\ Krishna Myneni, 2010-04-17
\
\ Multiply a 2xn matrix A with a nx1 vector B to find the 2x1 result.
\ Use one process to multiply the first row of A with B, and use a 
\ child process to multiply the second row of A with B. On a multi-core 
\ CPU, the child process should be forked by the OS to use a different 
\ core.
\
\ Notes:
\
\  0) The syscall, "fork", is implemented in syscalls.4th (formerly named
\     syscalls386.4th). Syscalls may be implemented by any Forth, on
\     Unix-like systems, which provides an assembler.
\
\  1) When the number of elements, NELEM, is small, the single process method
\     is faster, presumably because of the overhead associated with forking.
\     However, as NELEM is increased, the concurrently running parent and child
\     processes should give a factor of two speed-up for the parallel method
\     over the serial method. This behavior is observed.
\
\  2) This example does not implement any interprocess communication to
\     allow the child process to pass its result back to the parent, which
\     would be necessary for a useful computation.
\
\  3) It's unclear to me, currently, if using BYE in the child is 
\     the appropriate way to terminate the child process.
\
\ Revisions:
\   2010-04-17  km  use syscall waitpid to wait for child to terminate
\                   before measuring elapsed time; renamed syscalls386.4th
\                   to syscalls.4th so INCLUDE statement changed accordingly

include ans-words
include syscalls
include fsl/fsl-util
include fsl/horner
include fsl/extras/noise

\ Shared state for the fork/wait sequence.
variable cpid        \ child process id
variable status      \ cell whose address is passed to waitpid

\ Problem size: A{{ is declared as 2 x NELEM, B{ as NELEM elements.
1000000 constant NELEM
2 NELEM FLOAT MATRIX A{{
NELEM FLOAT ARRAY B{

: mul-row1 ( -- r )
    \ r = sum over i in [0, NELEM) of A{{ 0 i }} * B{ i }
    0e                                      \ running sum
    NELEM 0 DO
        B{ I } F@  A{{ 0 I }} F@  F*        \ product for element i
        F+                                  \ add it to the running sum
    LOOP
;

: mul-row2 ( -- r )
    \ r = sum over i in [0, NELEM) of A{{ 1 i }} * B{ i }
    0e                                      \ running sum
    NELEM 0 DO
        B{ I } F@  A{{ 1 I }} F@  F*        \ product for element i
        F+                                  \ add it to the running sum
    LOOP
;


: init-matrices ( -- )
    \ Fill A{{ and then B{ with values drawn from ran0.
    \ A{{ is filled one column at a time (row 0 first, then row 1), so the
    \ sequence of ran0 draws is: A[0][0] A[1][0] A[0][1] A[1][1] ...
    NELEM 0 DO
        ran0 A{{ 0 I }} F!
        ran0 A{{ 1 I }} F!
    LOOP
    \ B{ is filled afterwards, in index order.
    NELEM 0 DO
        ran0 B{ I } F!
    LOOP
;

: parallel ( -- )
     \ Fork and split the work between two processes: the parent computes
     \ and prints row 1 of the product, the child computes and prints row 2.
     \ The value returned by fork is kept in cpid. The child leaves through
     \ BYE, so only the parent returns to the caller.
     \ ms@ cr ." Start of parent: " .
     fork cpid !
     cpid @ 0< ABORT" Unable to fork!"
     cpid @ IF
        \ nonzero value from fork: parent, handles row 1 of A{{
        mul-row1
        cr ." Row 1: " f.
        \ cr ." End of parent: " ms@ .
     ELSE
        \ zero from fork: child, handles row 2 of A{{
        \ ms@  cr ." Start of child: " .
        mul-row2
        cr ." Row 2: " f.
        \ cr ." End of child: " ms@ . cr
        bye
     THEN
;

\ Single-process reference: compute and print both rows of the product,
\ row 1 first and then row 2, within the current process.
: serial ( -- )
     mul-row1
     cr ." Row 1: " f.
     mul-row2
     cr ." Row 2: " f.
;

\ --- Prepare the test data ---
cr .( Initializing the matrices ... )
init-matrices
cr .( The matrix A is 2 x ) NELEM .

\ --- Time the single-process version ---
cr cr .( Executing the serial algorithm  )
ms@                         \ start time
serial
ms@ swap -                  \ elapsed = end time - start time
cr .( Elapsed [ms]: ) .

\ --- Time the two-process version ---
cr cr .( Executing the parallel algorithm  )
ms@ parallel
\ Only the parent reaches this point (the child exits via BYE inside
\ PARALLEL). The start time from ms@ is still on the stack. Wait for the
\ child to terminate so that the elapsed time covers both rows.
\ The value returned by waitpid is compared against the child's pid.
cpid @ status 0 waitpid cpid @ <> 
[IF] 
cr .( Child process did not terminate properly! )
drop    \ discard the unused start time so the stack is left clean
[ELSE]  
ms@ swap - 
cr .( Elapsed [ms]: ) . 
[THEN]

