/********************************************************************************
   lbench     Linux benchmark program

   Copyright 2007-2019 Michael Cornelison
   source code URL: https://kornelix.net
   contact: kornelix@posteo.de

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version. See https://www.gnu.org/licenses

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
   See the GNU General Public License for more details.

*********************************************************************************/

#include "zfuncs.h"

//  program title and version, used as the main window title
#define gtitle "lbench 3.7"                                                      //  version

//  parameters and default values 
//  Each variable below is also registered in parmtab[] (name and limits),
//  which is what the parameter dialog and parameter list work from.
int      Nthreads = 1;                                                           //  parallel threads
int      runtime = 1;                                                            //  benchmark run time
int      memKB[5] = { 1, 10, 100, 1000, 10000 };                                 //  memory bench region size, KB
int      fiboN = 44;                                                             //  fibonacci bench
int      diskMB = 100;                                                           //  disk I/O bench file size, MB
int      diskKB[4] = { 1, 10, 100, 1000 };                                       //  disk I/O bench record size, KB
int      memMB = 500;                                                            //  memory test size, MB

//  Nparms is the entry count of both parmdesc[] and parmtab[].
//  parmdesc[n] is the description text referenced by parmtab[n],
//  so the two tables must be kept in the same order.
#define  Nparms 14                                                               //  no. of parameters

const char  *parmdesc[Nparms] =  {
      "parallel execution threads",                                              //  parameter descriptions
      "benchmark run time, secs",
      "memory benchmark region 0, KB",
      "memory benchmark region 1, KB",
      "memory benchmark region 2, KB",
      "memory benchmark region 3, KB",
      "memory benchmark region 4, KB",
      "Fibonacci benchmark number",
      "disk I/O benchmark file size, MB",
      "disk I/O benchmark record size 0, KB",
      "disk I/O benchmark record size 1, KB",
      "disk I/O benchmark record size 2, KB",
      "disk I/O benchmark record size 3, KB",
      "memory test region size, MB"   };

//  one row of the parameter table
typedef struct {
   const char  *name;                                                            //  all parameter data in a table
   int         lolim, hilim;                                                     //  lowest and highest accepted value (inclusive)
   int         *parm;                                                            //  address of the variable holding the value
   const char  *parmdesc;                                                        //  description text shown in the parameter list
}  parmtab_t;

//  Parameter table: one row per adjustable parameter.
//  The parm-name is also the name of the matching entry widget in the
//  parameter dialog (m_parms), which stuffs and fetches values by that name.
parmtab_t      parmtab[Nparms] =  {
/*        parm-name     low     high       variable      description       */
      {  "threads",     1,      9,         &Nthreads,    parmdesc[0]   },
      {  "runtime",     1,      99,        &runtime,     parmdesc[1]   },
      {  "mem0KB",      1,      100000,    &memKB[0],    parmdesc[2]   },
      {  "mem1KB",      1,      100000,    &memKB[1],    parmdesc[3]   },
      {  "mem2KB",      1,      100000,    &memKB[2],    parmdesc[4]   },
      {  "mem3KB",      1,      100000,    &memKB[3],    parmdesc[5]   },
      {  "mem4KB",      1,      100000,    &memKB[4],    parmdesc[6]   },
      {  "fiboN",       20,     50,        &fiboN,       parmdesc[7]   },
      {  "diskMB",      1,      1000,      &diskMB,      parmdesc[8]   },
      {  "disk0KB",     1,      1000,      &diskKB[0],   parmdesc[9]   },        //  1.4
      {  "disk1KB",     1,      1000,      &diskKB[1],   parmdesc[10]  },
      {  "disk2KB",     1,      1000,      &diskKB[2],   parmdesc[11]  },
      {  "disk3KB",     1,      1000,      &diskKB[3],   parmdesc[12]  },
      {  "memMB",       100,    8191,      &memMB,       parmdesc[13]  }  };

//  state shared between the GTK main thread and the benchmark threads
char     scratchfile[100] = "/tmp/lbench-scratch";                               //  disk I/O bench scratch file
int      NTbusy = 0;                                                             //  benchmark threads active (adjusted via zadd_locked)
int      killThreads = 0;                                                        //  tells threads to exit (set by m_kill, cleared by menufunc)
int      tid[10] = { 0,1,2,3,4,5,6,7,8,9 };                                      //  10 thread IDs; &tid[n] is passed as the thread argument

//  forward declarations for the functions defined in this file

int  initfunc(void *);                                                           //  GTK initial function
int  timer_func(void *);                                                         //  timer function, status monitor
void wprintz(cchar *mess, int bold);                                             //  print to log window from a thread
void load_parms();                                                               //  load parameters from file
void save_parms();                                                               //  save parameters to file
void rampCPU();                                                                  //  ramp-up CPU clock speed
void menufunc(GtkWidget *item, const char *menu);                                //  menu and toolbar functions

void m_parms();                                                                  //  toolbar functions
void m_list();
void m_clear();
void m_kill();
void m_quit();
void m_help();
void gate_threads_enter();                                                       //  mutex gate (not a toolbar function)
void gate_threads_leave();

//  Each benchmark has an m_xxx() starter called from the menu
//  and one or more xxx_thread() functions that do the work.

void m_all();                                                                    //  menu benchmark functions
void * all_thread(void *);
void m_cpu();
void * cpu_thread(void *);
void m_memspeed();
void * mem_thread(void *);
void m_funccall();                                                               //  3.2
void * funccall_thread(void *);
void m_matrix();                                                                 //  3.4
void * matrix_thread(void *);
void m_smp1();
void * smp1_thread(void *);
void m_smp2();
void * smp2_thread1(void *);
void * smp2_thread2(void *);
void m_smp3();
void * smp3_thread(void *);
void m_global_lock();                                                            //  3.7
void * global_lock_thread(void *);
void m_fibo();
void * fibo_thread(void *);
void m_whetstone();
void * whetstone_thread(void *);
void m_linpack();
void * linpack_thread(void *);
void m_disk();
void * disk_thread(void *);
void m_rpm();
void * rpm_thread(void *);
void m_memtest();
void * memtest_thread(void *);


/********************************************************************************/

//  main windowing program

GtkWidget      *mWin, *mVbox, *mScroll, *mLog;                                   //  main window widgets
GtkWidget      *benchmenu;
GtkTextBuffer  *textBuff;

int main(int argc, char *argv[])
{
   GtkWidget   *tbar;

   appimage_install("lbench");                                                   //  if appimage, menu integration      3.6

   if (argc > 1 && strmatch(argv[1],"-uninstall"))                               //  uninstall appimage                 3.6
      appimage_unstall();                                                        //  (does not return if uninstalled)

   setenv("GDK_BACKEND","x11",1);                                                //  fedora/wayland
   setenv("GTK_THEME","default",0);                                              //  KDE window manager

   gtk_init(&argc,&argv);                                                        //  initz. GTK

   zinitapp("lbench");                                                           //  set up app directory

   mWin = gtk_window_new(GTK_WINDOW_TOPLEVEL);                                   //  main window
   gtk_window_set_title(GTK_WINDOW(mWin),gtitle);
   gtk_window_set_position(GTK_WINDOW(mWin),GTK_WIN_POS_CENTER);
   gtk_window_set_default_size(GTK_WINDOW(mWin),700,600);
   
   mVbox = gtk_box_new(VERTICAL,0);                                              //  vertical packing box
   gtk_container_add(GTK_CONTAINER(mWin),mVbox);                                 //  add to main window

   tbar = create_toolbar(mVbox,32);                                              //  create tool bar and buttons
   add_toolbar_button(tbar,"bench","choose benchmark","lbench.png",menufunc);
   add_toolbar_button(tbar,"clear","clear window","clear.png",menufunc);
   add_toolbar_button(tbar,"kill","kill running function","stop.png",menufunc);
   add_toolbar_button(tbar,"parms","edit parameters","parms.png",menufunc);
   add_toolbar_button(tbar,"list","list parameters","list.png",menufunc);
   add_toolbar_button(tbar,"quit","quit lbench","quit.png",menufunc);
   add_toolbar_button(tbar,"help","show user guide","help.png",menufunc);
   
   benchmenu = create_popmenu();                                                       //  2.4
   add_popmenu_item(benchmenu,"all",menufunc,0,"run all benchmarks");
   add_popmenu_item(benchmenu,"cpu",menufunc,0,"CPU performance");
   add_popmenu_item(benchmenu,"mem speed",menufunc,0,"memory performance");
   add_popmenu_item(benchmenu,"func call",menufunc,0,"function call rate");            //  3.2
   add_popmenu_item(benchmenu,"matrix math",menufunc,0,"matrix math rate");            //  3.4
   add_popmenu_item(benchmenu,"smp1",menufunc,0,"thread switch rate");
   add_popmenu_item(benchmenu,"smp2",menufunc,0,"thread creation rate");
   add_popmenu_item(benchmenu,"smp3",menufunc,0,"process creation rate");        
   add_popmenu_item(benchmenu,"global lock",menufunc,0,"global lock/unlock rate");     //  3.7
   add_popmenu_item(benchmenu,"fibo",menufunc,0,"fibonacci compute time");
   add_popmenu_item(benchmenu,"whetstone",menufunc,0,"Whetstone benchmark");
   add_popmenu_item(benchmenu,"linpack",menufunc,0,"Linpack benchmark");
   add_popmenu_item(benchmenu,"disk",menufunc,0,"disk I/O throughput");
   add_popmenu_item(benchmenu,"rpm",menufunc,0,"CPU temperature and throttling");
   add_popmenu_item(benchmenu,"mem test",menufunc,0,"memory test and burn-in");

   mScroll = gtk_scrolled_window_new(0,0);                                       //  scrolled window
   gtk_box_pack_start(GTK_BOX(mVbox),mScroll,1,1,0);                             //  add to main window mVbox
   mLog = gtk_text_view_new();                                                   //  text window
   gtk_text_view_set_left_margin(GTK_TEXT_VIEW(mLog),2);
   gtk_container_add(GTK_CONTAINER(mScroll),mLog);                               //  add to scrolled window
   textBuff = gtk_text_view_get_buffer(GTK_TEXT_VIEW(mLog));                     //  get related text buffer

   G_SIGNAL(mWin,"destroy",m_quit,0);                                            //  connect signals to main window
   G_SIGNAL(mWin,"delete-event",m_quit,0);

   g_timeout_add(0,initfunc,0);                                                  //  setup initial call from gtk_main()
   gtk_widget_show_all(mWin);                                                    //  show all widgets
   gtk_main();                                                                   //  process events
   return 0;
}


//  initial function called from gtk_main() at startup

int initfunc(void *varg)
{
   const int   period = 100;                                                     //  log polling period, millisecs

   load_parms();                                                                 //  restore parameters from prior session
   g_timeout_add(period,timer_func,0);                                           //  start the periodic log monitor
   return 0;                                                                     //  0 = do not call again
}


//  timer function to monitor status and print thread outputs

//  Log message queue: a ring of 10 slots filled by wprintz() (any thread)
//  and emptied by timer_func() (GTK main thread).
//  A slot whose first byte is 0 is free.

int      logP1 = 0;                                                              //  last log message filled
int      logP2 = 0;                                                              //  last log message emptied
int      logbold[10];                                                            //  bold print flag for 10 log messages
char     logmess[10][100];                                                       //  up to 10 pending log messages

//  Periodic GTK timer callback: write all pending thread messages
//  to the log window. Always returns 1 so that the timer keeps running.

int timer_func(void *)
{
   int      slot;

   while (logP1 != logP2)                                                        //  thread output is available
   {
      slot = logP2 + 1;                                                          //  next message to output
      if (slot > 9) slot = 0;
      logP2 = slot;                                                              //  shared index is always within 0-9

      textwidget_append2(mLog,logbold[slot] ? 1 : 0,"%s",logmess[slot]);         //  message is data, not a format:
                                                                                 //    a '%' in it must print literally
      *logmess[slot] = 0;                                                        //  mark message slot available
   }

   return 1;                                                                     //  keep timer running
}


//  function for threads to use to print to log window

//  Queue a message for the log window. Callable from any thread.
//  mess: message text, truncated to fit a 100-byte slot.
//  bold: nonzero to print the message in bold.
//  Blocks (polling every 0.1 sec) while the queue is full. The queue is
//  emptied by timer_func() in the GTK main thread.

void wprintz(cchar *mess, int bold)
{
   int      pp;
   
   while (true) 
   {
      gate_threads_enter();                                                      //  one writer at a time
      pp = logP1 + 1;                                                            //  next message slot to fill
      if (pp > 9) pp = 0;
      if (pp != logP2 && *logmess[pp] == 0) break;                               //  slot is free (lock is kept)
      gate_threads_leave();                                                      //  queue full, or slot still waiting to output
      zsleep(0.1);                                                               //  try again later
   }

   //  The slot at logP2 is never filled: doing so would make logP1 == logP2,
   //  which timer_func() reads as "queue empty", and the queued messages
   //  would never be output. The queue therefore holds at most 9 messages.

   logbold[pp] = bold;
   strncpy0(logmess[pp],mess,100);
   logP1 = pp;                                                                   //  publish the message
   gate_threads_leave();
   return;
}


//  load parameters from prior session

void load_parms()
{
   int            np, xx;
   FILE           *fid;
   char           buff[1000];
   
   snprintf(buff,999,"%s/saved_parms",get_zhomedir());                           //  parameters file

   fid = fopen(buff,"r");
   if (! fid) {
      textwidget_append2(mLog,1," no parameter file, using defaults \n");
      return;
   }
   
   np = fscanf(fid," %d %d %d %d %d %d %d %d %d %d %d %d %d %d",                 //  test read
               &xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx,&xx);
   fclose(fid);

   if (np != Nparms) {
      textwidget_append2(mLog,1," bad parameter file, using defaults \n");
      return;
   }

   fid = fopen(buff,"r");                                                        //  read parameters
   np = fscanf(fid," %d %d %d %d %d %d %d %d %d %d %d %d %d %d",
               &Nthreads, &runtime, 
               &memKB[0], &memKB[1], &memKB[2], &memKB[3], &memKB[4],
               &fiboN, 
               &diskMB, &diskKB[0], &diskKB[1], &diskKB[2], &diskKB[3], 
               &memMB );
   fclose(fid);

   return;
}


//  save parameters for next session

//  Save the current parameters for the next session in the file
//  <home dir>/saved_parms, as integers in parmtab[] order.
//  A failure to open or to completely write the file is reported
//  in the log window.

void save_parms()
{
   FILE           *fid;
   char           buff[1000];
   int            err;

   snprintf(buff,sizeof(buff),"%s/saved_parms",get_zhomedir());                  //  parameters file

   fid = fopen(buff,"w");
   if (! fid) {
      textwidget_append2(mLog,1," cannot write parameter file \n");
      return;
   }
   
   fprintf(fid," %d %d \n", Nthreads, runtime);
   fprintf(fid," %d %d %d %d %d \n", memKB[0], memKB[1], memKB[2], memKB[3], memKB[4]);
   fprintf(fid," %d \n", fiboN);
   fprintf(fid," %d %d %d %d %d \n", diskMB, diskKB[0], diskKB[1], diskKB[2], diskKB[3]);
   fprintf(fid," %d \n", memMB);
   
   err = ferror(fid);                                                            //  any fprintf() failure so far
   if (fclose(fid) != 0) err = 1;                                                //  buffered data is written here
   if (err) textwidget_append2(mLog,1," cannot write parameter file \n");        //  e.g. disk full
   return;
}


//  list all parameter data 

void m_list()                                                                    //  2.2
{
   textwidget_append2(mLog,1,"\n parm name      low    high    curr     description \n");

   for (int ii = 0; ii < Nparms; ii++)
      textwidget_append2(mLog,0," %-10s %7d %7d %7d     %s \n", parmtab[ii].name, 
              parmtab[ii].lolim, parmtab[ii].hilim, *parmtab[ii].parm,           //  add current value 1.6
              parmtab[ii].parmdesc);
   return;
}


//  do a meaningless loop to ramp-up CPU to full clock speed
//  (may not speed up instantly when a benchmark is started)

void rampCPU()
{
   double   start = get_seconds();
   double   junk = 1;

   for (;;)                                                                      //  busy loop for 0.5 seconds
   {
      if (! (get_seconds() - start < 0.5)) break;
      junk = junk + 1.0 / junk;                                                  //  throwaway arithmetic, result unused
   }
}


//  process menu or toolbar selection event

void menufunc(GtkWidget *, const char *menu)
{
   static const struct { const char *name; void (*func)(); }  actions[] = {      //  menu name --> function
      {  "parms",         m_parms         },
      {  "list",          m_list          },
      {  "clear",         m_clear         },
      {  "kill",          m_kill          },
      {  "quit",          m_quit          },
      {  "help",          m_help          },
      {  "all",           m_all           },
      {  "cpu",           m_cpu           },
      {  "mem speed",     m_memspeed      },
      {  "func call",     m_funccall      },                                     //  3.2
      {  "matrix math",   m_matrix        },                                     //  3.4
      {  "smp1",          m_smp1          },
      {  "smp2",          m_smp2          },
      {  "smp3",          m_smp3          },
      {  "global lock",   m_global_lock   },                                     //  3.7
      {  "fibo",          m_fibo          },
      {  "whetstone",     m_whetstone     },
      {  "linpack",       m_linpack       },
      {  "disk",          m_disk          },
      {  "rpm",           m_rpm           },
      {  "mem test",      m_memtest       }  };

   const int   Nactions = sizeof(actions) / sizeof(actions[0]);

   killThreads = 0;                                                              //  any selection clears a prior kill request

   if (strmatch(menu,"bench")) {                                                 //  toolbar [bench] opens the popup menu
      popup_menu(mWin,benchmenu);
      return;
   }

   for (int ii = 0; ii < Nactions; ii++)                                         //  names are unique: first match is the only match
   {
      if (! strmatch(menu,actions[ii].name)) continue;
      actions[ii].func();
      return;
   }
}


//  edit parameters

void m_parms()
{
   int            zstat, ii, lolim, hilim, ztemp;
   const char     *pname;
   zdialog        *zd;

/***
       _____________________________________________________
      |        Set Benchmark Parameters                     |
      |                                                     |
      | parallel threads  [___]                             |
      | benchmark run time, seconds  [___]                  |
      | memory bench size, KB [___] [___] [___] [___] [___] |
      | fibonacci bench number  [___]                       |
      | disk I/O file size, MB  [___]                       |
      | disk I/O record size, KB  [___] [___] [___] [___]   |
      | memory test size, MB [___]                          |
      |                                    [apply] [cancel] |
      |_____________________________________________________|      

***/

   zd = zdialog_new("Set Benchmark Parameters",mWin,"apply","cancel",null);
   zdialog_add_widget(zd,"hbox","hbth","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labth","hbth","parallel threads","space=3");
   zdialog_add_widget(zd,"entry","threads","hbth","1","size=3");

   zdialog_add_widget(zd,"hbox","hbsecs","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labsecs","hbsecs","benchmark run time, seconds","space=3");
   zdialog_add_widget(zd,"entry","runtime","hbsecs","1","size=3");

   zdialog_add_widget(zd,"hbox","hbmem","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labmem","hbmem","memory bench size, KB","space=3");
   zdialog_add_widget(zd,"entry","mem0KB","hbmem","10","size=3");
   zdialog_add_widget(zd,"entry","mem1KB","hbmem","100","size=3");
   zdialog_add_widget(zd,"entry","mem2KB","hbmem","1000","size=4");
   zdialog_add_widget(zd,"entry","mem3KB","hbmem","10000","size=5");
   zdialog_add_widget(zd,"entry","mem4KB","hbmem","100000","size=6");

   zdialog_add_widget(zd,"hbox","hbfibo","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labfibo","hbfibo","fibonacci bench number","space=3");
   zdialog_add_widget(zd,"entry","fiboN","hbfibo","40","size=4");

   zdialog_add_widget(zd,"hbox","hbdiskMB","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labdiskMB","hbdiskMB","disk I/O file size, MB","space=3");
   zdialog_add_widget(zd,"entry","diskMB","hbdiskMB","100","size=4");

   zdialog_add_widget(zd,"hbox","hbdiskrec","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labdiskrec","hbdiskrec","disk I/O record size, KB","space=3");
   zdialog_add_widget(zd,"entry","disk0KB","hbdiskrec","1","size=3");
   zdialog_add_widget(zd,"entry","disk1KB","hbdiskrec","10","size=4");
   zdialog_add_widget(zd,"entry","disk2KB","hbdiskrec","100","size=5");
   zdialog_add_widget(zd,"entry","disk3KB","hbdiskrec","1000","size=6");

   zdialog_add_widget(zd,"hbox","hbmemtest","dialog",0,"space=3");
   zdialog_add_widget(zd,"label","labmemtest","hbmemtest","memory test size, MB","space=3");
   zdialog_add_widget(zd,"entry","memMB","hbmemtest","500","size=6");

   for (ii = 0; ii < Nparms; ii++)                                               //  stuff current parameters
      zdialog_stuff(zd,parmtab[ii].name,*(parmtab[ii].parm));

repeat_dialog:

   zdialog_run(zd,0,"parent");                                                   //  run dialog, blocking
   zstat = zdialog_wait(zd);

   if (zstat != 1) {
      zdialog_free(zd);                                                          //  cancel
      return;
   }
   
   for (ii = 0; ii < Nparms; ii++)                                               //  get revised parameters
   {
      pname = parmtab[ii].name;
      lolim = parmtab[ii].lolim;
      hilim = parmtab[ii].hilim;

      zdialog_fetch(zd,pname,ztemp);
      if (ztemp < lolim || ztemp > hilim) {
         textwidget_append2(mLog,0," *** %s must be %d-%d *** \n",pname,lolim,hilim);
         zdialog_goto(zd,pname);
         zd->zstat = 0;
         goto repeat_dialog;
      }
      else *(parmtab[ii].parm) = ztemp;
   }
   
   zdialog_free(zd);
   return;
}


//  clear the window

void m_clear()
{
   const char  *nothing = "";

   gtk_text_buffer_set_text(textBuff,nothing,-1);                                //  replace all log text with an empty string
}


//  tell running threads to quit

void m_kill()
{
   textwidget_append2(mLog,0,"\n kill ... \n");

   if (! NTbusy) {                                                               //  nothing is running
      wprintz("\n ready\n",0);                                                   //  2.4
      wprintz("\n",0);
      return;
   }

   killThreads = 1;                                                              //  ask the threads to exit
   while (NTbusy) zsleep(0.1);                                                   //  wait here until all have gone
   killThreads = 0;
}


//  quit program

void m_quit()
{
   save_parms();                                                                 //  keep parameters for the next session
   gtk_main_quit();                                                              //  end the GTK main loop
}


//  show user guide in a new window

void m_help()
{
   const char  *datadir = get_zdatadir();                                        //  directory holding the user guide

   shell_ack("xdg-open %s/userguide",datadir);                                   //  open it with the desktop's default viewer
}


/********************************************************************************/

//  supply unused zdialog() callback function

void KBevent(GdkEventKey *event) 
{
   return;                                                                       //  keyboard events are ignored
}


/********************************************************************************/

//  Allow only one thread at a time through a function that is otherwise 
//  not thread-safe. Function must call these functions at entry and exit.

mutex_t gate_threads_mutex = PTHREAD_MUTEX_INITIALIZER;                          //  the gate

void gate_threads_enter()                                                        //  wait for the gate, then close it
{
   mutex_lock(&gate_threads_mutex);
}

void gate_threads_leave()                                                        //  open the gate for the next thread
{
   mutex_unlock(&gate_threads_mutex);
}


/********************************************************************************/

//  start all benchmarks
//  (a thread runs every benchmark in sequence, except rpm and mem test)

void m_all()
{
   zadd_locked(NTbusy,+1);                                                       //  count the sequencing thread as busy
   start_detached_thread(all_thread, 0);                                         //  it starts the benchmarks one by one
   return;
}


//  thread for running all benchmarks

void * all_thread(void *varg)
{
   typedef void benchfunc_t();

   static benchfunc_t * const  sequence[] = {                                    //  benchmark starters, in run order
      m_cpu, 
      m_memspeed, 
      m_funccall,                                                                //  3.2
      m_matrix,                                                                  //  3.4
      m_smp1, 
      m_smp2, 
      m_smp3, 
      m_global_lock,                                                             //  3.7
      m_fibo, 
      m_whetstone,                                                               //  2.0
      m_linpack,                                                                 //  2.1
      m_disk  };

   const int   Nsequence = sizeof(sequence) / sizeof(sequence[0]);

   for (int ii = 0; ii < Nsequence; ii++)
   {
      sequence[ii]();                                                            //  start this benchmark's threads
      while (NTbusy > 1) sleep(1);                                               //  wait until only this thread remains busy
      if (killThreads) break;                                                    //  user kill: skip the rest
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done

   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }

   return 0;
}


/********************************************************************************/

//  loopy 
//  This nutty thing stops thread 0 from being sparsely scheduled when the 
//  first benchmark is started. Mystery lies somewhere in the Linux kernel.

void m_loopy()                                                                   //  3.3
{
   void * loopy_thread(void *);                                                  //  defined below, not in the file prototypes

   int      nth = 0;

   synch_threads(Nthreads);                                                      //  set up synchronization for Nthreads threads

   while (nth < Nthreads)                                                        //  start the do-nothing threads
   {
      zadd_locked(NTbusy,+1);
      start_detached_thread(loopy_thread, &tid[nth]);
      nth++;
   }

   while (NTbusy > 1) sleep(1);                                                  //  wait until at most one busy thread remains
}

void * loopy_thread(void *varg)                                                  //  thread argument is not used
{
   synch_threads(0);                                                             //  meet the other loopy threads
   zadd_locked(NTbusy,-1);                                                       //  then leave at once
   return 0;
}


/********************************************************************************/

//  CPU benchmark

double   Drandom[10000];                                                         //  10,000 random numbers, 0.0 - 0.999...
int      Irandom[10000];                                                         //  10,000 random numbers, 0 - 999

//  Start the CPU benchmark: log the CPU model name from /proc/cpuinfo 
//  (if available), fill the random number tables read by the benchmark 
//  loops, then start Nthreads detached cpu_thread() threads.

void m_cpu()
{
   FILE     *fid;
   char     buff[100], message[100], *pp;
   
   m_loopy();                                                                    //  3.3

   fid = fopen("/proc/cpuinfo","r");                                             //  output CPU data
   if (fid) 
   {
      while (true) 
      {
         pp = fgets_trim(buff,100,fid);
         if (! pp) break;                                                        //  EOF, no model name found
         if (strncmp(pp,"model name",10) != 0) continue;
         pp = strchr(pp,':');                                                    //  value follows the colon
         if (! pp) continue;                                                     //  no colon: do not scan past the string end
         pp++;
         while (*pp == ' ' || *pp == '\t') pp++;                                 //  skip blanks before the value
         snprintf(message,100,"\n %s \n\n",pp);
         wprintz(message,0);
         break;                                                                  //  first model name is enough
      }

      fclose(fid);
   }
   
   for (int ii = 0; ii < 10000; ii++)                                            //  random inputs for the benchmark loops
   {
      Drandom[ii] = drandz();
      Irandom[ii] = 1000 * drandz();
   }
   
   synch_threads(Nthreads);                                                      //  set up synchronization for Nthreads threads

   for (int ii = 0; ii < Nthreads; ii++)
   {
      zadd_locked(NTbusy,+1);
      start_detached_thread(cpu_thread, &tid[ii]);
   }
}


//  Thread for CPU benchmark.
//  CPU performance for various operations.
//  Compiler optimizations should be disabled.

void * cpu_thread(void *varg)
{
   static double  loop_time, mops[10];

   double      secs, time0, count;
   int         ii, jj, kk;
   double      dd, ee, djj, dkk;
   char        message1[100], message2[20];

   int th = * ((int *) varg);

   synch_threads(0);                                                             //  start threads together
   rampCPU();                                                                    //  ramp-up CPU clock

   if (th == 0) wprintz("\n CPU Performance, million/sec \n",1);

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  integer add/subtract
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 0; ii < 10000; ii++)
      {
         jj = kk = Irandom[ii];
         jj = jj + ii;
         jj = jj + kk + 1;
         jj = jj - kk - 2;
         jj = jj + kk + 3;
         jj = jj - kk - 4;
         jj = jj + kk + 5;
         jj = jj - kk - 6;
         kk++;
         jj = jj + ii;
         jj = jj + kk + 1;
         jj = jj - kk - 2;
         jj = jj + kk + 3;
         jj = jj - kk - 4;
         jj = jj + kk + 5;
         jj = jj - kk - 6;
         kk++;
         jj = jj + ii;
         jj = jj + kk + 1;
         jj = jj - kk - 2;
         jj = jj + kk + 3;
         jj = jj - kk - 4;
         jj = jj + kk + 5;
         jj = jj - kk - 6;
         kk++;
         jj = jj + ii;
         jj = jj + kk + 1;
         jj = jj - kk - 2;
         jj = jj + kk + 3;
         jj = jj - kk - 4;
         jj = jj + kk + 5;
         jj = jj - kk - 6;
         kk++;
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 60.0 / loop_time / 1000000.0;

   synch_threads(0);                                                             //  wait for all threads done
   if (killThreads) goto thread_exit;

   if (th == 0)                                                                  //  one thread reports
   {
      sprintf(message1,"  integer add/subtract    ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.0f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  integer multiply/divide
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 1; ii < 10000; ii++)
      {
         jj = Irandom[ii];
         jj = jj * ii * 10 / 11;
         jj = jj * 33 / 37;
         jj = jj * 100 / 111;
         jj = jj * 3 / ii;
         jj = jj * ii * 10 / 11;
         jj = jj * 33 / 37;
         jj = jj * 100 / 111;
         jj = jj * 3 / ii;
         jj = jj * ii * 10 / 11;
         jj = jj * 33 / 37;
         jj = jj * 100 / 111;
         jj = jj * 3 / ii;
         jj = jj * ii * 10 / 11;
         jj = jj * 33 / 37;
         jj = jj * 100 / 111;
         jj = jj * 3 / ii;
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 36.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  integer multiply/divide ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.0f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  double add/subtract
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 0; ii < 10000; ii++)
      {
         djj = dkk = Drandom[ii];
         djj = djj + dkk + 1.0;
         djj = djj - dkk - 2.0;
         djj = djj + dkk + 3.0;
         djj = djj - dkk - 4.0;
         djj = djj + dkk + 5.0;
         djj = djj - dkk - 6.0;
         dkk = dkk + 4.0;
         djj = djj + dkk + 1.0;
         djj = djj - dkk - 2.0;
         djj = djj + dkk + 3.0;
         djj = djj - dkk - 4.0;
         djj = djj + dkk + 5.0;
         djj = djj - dkk - 6.0;
         dkk = dkk + 4.0;
         djj = djj + dkk + 1.0;
         djj = djj - dkk - 2.0;
         djj = djj + dkk + 3.0;
         djj = djj - dkk - 4.0;
         djj = djj + dkk + 5.0;
         djj = djj - dkk - 6.0;
         dkk = dkk + 4.0;
         djj = djj + dkk + 1.0;
         djj = djj - dkk - 2.0;
         djj = djj + dkk + 3.0;
         djj = djj - dkk - 4.0;
         djj = djj + dkk + 5.0;
         djj = djj - dkk - 6.0;
         dkk = dkk + 4.0;
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 52.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  float-64 add/subtract   ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.0f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  double multiply/divide
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 1; ii < 10000; ii++)
      {
         djj = Drandom[ii];
         djj = djj * 10.0 / 11.0;
         djj = djj * 0.9988;
         djj = djj * 0.90345;
         djj = djj * 1.2191;
         djj = djj * 10.0 / 11.0;
         djj = djj * 0.9988;
         djj = djj * 0.90345;
         djj = djj * 1.2191;
         djj = djj * 10.0 / 11.0;
         djj = djj * 0.9988;
         djj = djj * 0.90345;
         djj = djj * 1.2191;
         djj = djj * 10.0 / 11.0;
         djj = djj * 0.9988;
         djj = djj * 0.90345;
         djj = djj * 1.2191;
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 20.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  float-64 multiply/divide");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.0f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  sqrt() function
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 0; ii < 10000; ii++)
      {
         dd = Drandom[ii];
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         dd = dd + sqrt(dd) + sqrt(dd + 10) + sqrt(dd + 100);
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 30.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  sqrt() function         ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.1f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  pow() function
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      for (ii = 0; ii < 10000; ii++)
      {
         dd = Drandom[ii] * 100;
         dd = pow(dd,0.5);
         dd = pow(dd,2.0);
         dd = pow(dd,4.0);
         dd = pow(dd,0.5);
         dd = pow(dd,2.0);
         dd = pow(dd,4.0);
         dd = pow(dd,0.5);
         dd = pow(dd,2.0);
         dd = pow(dd,4.0);
         dd = pow(dd,0.5);
         dd = pow(dd,2.0);
         dd = pow(dd,4.0);
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 12.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  pow() function          ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.1f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  sin() function
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      dd = -3.0;
      ee = 0;

      for (ii = 0; ii < 10000; ii++)
      {
         dd += 0.0001;
         if (dd > 3) dd = -3;
         ee += sin(dd) + sin(dd + 0.01) + sin(dd + 0.02);
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 3.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  sin() function          ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.1f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

   secs = count = 0.0;                                                           //  asin() function
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      dd = -1;
      ee = 0;

      for (ii = 0; ii < 10000; ii++)
      {
         dd += 0.0001;
         if (dd > 1) dd = -1;
         ee += asin(dd) + asin(dd + 0.01) + asin(dd + 0.02);
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;
   mops[th] = 3.0 / loop_time / 1000000.0;

   synch_threads(0);
   if (killThreads) goto thread_exit;

   if (th == 0)
   {
      sprintf(message1,"  asin() function         ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %7.1f", mops[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

// ------------------------------------------------------------------------------

thread_exit:
   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}


/********************************************************************************/

//  Entry point: start the memory benchmark.
//  Starts Nthreads detached mem_thread() workers. Each worker is handed the
//  address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_memspeed()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(mem_thread, &tid[slot]);
      slot++;
   }
}


//  thread for memory benchmark

void * mem_thread(void *varg)
{
   int            th, bs, ii, jj, nn;
   double         fbytes, secs, time0;
   char           *buff1, *buff2;
   int32          *num1, *num2, *rnum;
   char           message1[100], message2[20];
   static double  MBs[10];

   th = * ((int *) varg);

   //  block move memory performance

   if (th == 0)
   {
      wprintz("\n Block Move Memory Performance \n",1);
      wprintz("  memory range    megabytes/sec  \n",1);
   }

   rampCPU();

   for (bs = 0; bs < 5; bs++)                                                    //  loop for each memory record size
   {
      nn = 1024 * memKB[bs];                                                     //  allocate two memory blocks
      buff1 = (char *) zmalloc(nn);
      buff2 = (char *) zmalloc(nn);

      secs = 0.0;
      fbytes = 0.0;
      time0 = get_seconds();

      while (secs < runtime && ! killThreads)
      {
         memmove(buff1,buff2,nn);
         fbytes += nn;
         secs = get_seconds() - time0;
      }

      MBs[th] = fbytes / secs / 1000000.0;

      synch_threads(0);
      if (killThreads) break;

      if (th == 0)
      {
         sprintf(message1,"%9d KB   ",memKB[bs]);
         for (int ii = 0; ii < Nthreads; ii++) {
            sprintf(message2," %7.0f",MBs[ii]);
            strcat(message1,message2);
         }
         strcat(message1,"\n");
         wprintz(message1,0);
      }

      zfree(buff1);                                                              //  free memory
      zfree(buff2);
   }

   //  sequential 4-char get/put memory performance                              //  2.6

   if (th == 0)
   {
      wprintz("\n Sequential int-32 get/put Memory Performance \n",1);
      wprintz("  memory range    megabytes/sec  \n",1);
   }

   for (bs = 0; bs < 5; bs++)                                                    //  loop for each memory record size
   {
      nn = 1024 * memKB[bs];                                                     //  allocate two memory blocks
      num1 = (int32 *) zmalloc(nn);
      num2 = (int32 *) zmalloc(nn);

      rnum = (int32 *) zmalloc(nn);

      for (ii = 0; ii < nn/4; ii++)                                              //  nn/4 sequential values 0..nn/4-1
         rnum[ii] = ii;

      secs = 0.0;
      fbytes = 0.0;
      time0 = get_seconds();

      while (secs < runtime && ! killThreads)
      {
         for (ii = 0; ii < nn/4; ii++) {
            jj = rnum[ii];
            num1[jj] = num2[jj];
         }

         fbytes += nn;
         secs = get_seconds() - time0;
      }

      MBs[th] = fbytes / secs / 1000000.0;

      synch_threads(0);
      if (killThreads) break;

      if (th == 0)
      {
         sprintf(message1,"%9d KB   ",memKB[bs]);
         for (int ii = 0; ii < Nthreads; ii++) {
            sprintf(message2," %7.0f",MBs[ii]);
            strcat(message1,message2);
         }
         strcat(message1,"\n");
         wprintz(message1,0);
      }

      zfree(num1);                                                               //  free memory
      zfree(num2);
      zfree(rnum);
   }

   //  random 4-char get/put memory performance                                  //  2.4

   if (th == 0)
   {
      wprintz("\n Random int-32 get/put Memory Performance \n",1);
      wprintz("  memory range    megabytes/sec  \n",1);
   }

   for (bs = 0; bs < 5; bs++)                                                    //  loop for each memory record size
   {
      nn = 1024 * memKB[bs];                                                     //  allocate two memory blocks
      num1 = (int32 *) zmalloc(nn);
      num2 = (int32 *) zmalloc(nn);

      rnum = (int32 *) zmalloc(nn);
      
      for (ii = 0; ii < nn/4; ii++)                                              //  nn/4 random values range 0..nn/4-1
         rnum[ii] = rand() % nn/4;

      secs = 0.0;
      fbytes = 0.0;
      time0 = get_seconds();

      while (secs < runtime && ! killThreads)
      {
         for (ii = 0; ii < nn/4; ii++) {
            jj = rnum[ii];
            num1[jj] = num2[jj];
         }

         fbytes += nn;
         secs = get_seconds() - time0;
      }

      MBs[th] = fbytes / secs / 1000000.0;

      synch_threads(0);
      if (killThreads) break;

      if (th == 0)
      {
         sprintf(message1,"%9d KB   ",memKB[bs]);
         for (int ii = 0; ii < Nthreads; ii++) {
            sprintf(message2," %7.0f",MBs[ii]);
            strcat(message1,message2);
         }
         strcat(message1,"\n");
         wprintz(message1,0);
      }

      zfree(num1);                                                               //  free memory
      zfree(num2);
      zfree(rnum);
   }

   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}


/********************************************************************************/

//  Entry point: start the function call benchmark,                              //  3.2
//  which measures call/return time for a simple function.
//  Starts Nthreads detached funccall_thread() workers. Each worker is handed
//  the address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_funccall()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(funccall_thread, &tid[slot]);
      slot++;
   }
}


//  thread for function call benchmark
//
//  varg points to an int holding this thread's number; thread 0 reports.
//  Until 'runtime' seconds have elapsed or killThreads is set, calls
//  funccall_function() in batches of 1000 (100 loops x 10 calls). Each call
//  adds 1 to funccall_value, so the value after a batch is the number of calls
//  made in it. The int is restarted at 0 for every batch and the grand total
//  is kept in a double, so a long run time or a fast CPU cannot overflow it.
//  rate[th] receives million calls/sec; after synch_threads(0) thread 0
//  prints all threads' rates unless killed.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.

void * funccall_thread(void *varg)
{
   int  funccall_function(int,int);

   int            th, ii;
   char           message1[100], message2[20];
   double         time0, secs;
   double         calls = 0;                                                     //  total calls made, all batches
   int            funccall_value;                                                //  calls made in the current batch
   static double  rate[10];                                                      //  per-thread rates, shared by all threads

   th = * ((int *) varg);

   if (th == 0) wprintz("\n Function Call benchmark \n",1);

   rampCPU();

   secs = 0.0;
   time0 = get_seconds();

   while (secs < runtime && ! killThreads)
   {
      funccall_value = 0;                                                        //  restart per batch: int cannot overflow

      for (ii = 0; ii < 100; ii++) {
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
         funccall_value = funccall_function(funccall_value,1);
      }

      calls += funccall_value;                                                   //  using the result keeps the calls live
      secs = get_seconds() - time0;
   }

   secs = get_seconds() - time0;
   rate[th] = calls / secs / 1000000.0;                                          //  million calls per second

   synch_threads(0);

   if (th == 0 && ! killThreads)
   {
      sprintf(message1,"  call rate, million/sec: ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %6.1f",rate[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done
   if (! NTbusy) {                                                               //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;
}


//  Function called by funccall_thread() for the function call benchmark,
//  which measures call/return time for a simple function.
//  Returns n1 + n2. The body is the measured callee: leave it as is.

int funccall_function(int n1, int n2)
{
   return n1 + n2;
}


/********************************************************************************/

//  Entry point: start the 2D matrix math benchmark.
//  Starts Nthreads detached matrix_thread() workers. Each worker is handed the
//  address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_matrix()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(matrix_thread, &tid[slot]);
      slot++;
   }
}


//  Worker thread for the 2D matrix math benchmark.
//  varg points to an int holding this thread's number; thread 0 reports.
//  Calls matrix_function() repeatedly until 'runtime' seconds have elapsed or
//  killThreads is set, adding up the values it returns, then stores
//  sum / elapsed seconds in rates[] for this thread. After synch_threads(0)
//  thread 0 prints one line with every thread's rate, unless killed.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.

void * matrix_thread(void *varg)
{
   int matrix_function();

   static double  rates[10];                                                     //  one entry per thread, shared by all
   char           line[100], cell[20];
   double         tstart, elapsed = 0.0;
   int            done = 0;                                                      //  sum of matrix_function() results
   int            me = * ((int *) varg);
   int            tx;

   if (me == 0) wprintz("\n 2D matrix math benchmark \n",1);

   rampCPU();

   tstart = get_seconds();

   while (elapsed < runtime && ! killThreads)
   {
      done += matrix_function();
      elapsed = get_seconds() - tstart;
   }

   elapsed = get_seconds() - tstart;
   rates[me] = done / elapsed;

   synch_threads(0);

   if (me == 0 && ! killThreads)
   {
      strcpy(line,"  computations/sec: ");

      tx = 0;
      while (tx < Nthreads) {                                                    //  append one column per thread
         sprintf(cell," %6.1f",rates[tx]);
         strcat(line,cell);
         tx++;
      }

      strcat(line,"\n");
      wprintz(line,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done

   if (NTbusy == 0) {                                                            //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }

   return 0;
}


//  Workload for the 2D matrix math benchmark.
//  Fills a 1000 x 1000 matrix of doubles with 10.0 * drandz(), then makes
//  100 passes over it, replacing every element x with x + x * x.
//  Returns the number of passes made (always 100).
//  NOTE(review): the matrix is an automatic (stack) array of 1,000,000 doubles,
//  about 8 MB with 8-byte doubles - confirm that the threads calling this
//  function get a stack large enough to hold it.

int matrix_function()
{
   int      ii, jj, reps;
   double   matrix[1000][1000];
   
   for (ii = 0; ii < 1000; ii++)                                                 //  initialize with random values
   for (jj = 0; jj < 1000; jj++)
      matrix[ii][jj] = 10.0 * drandz();

   for (reps = 0; reps < 100; reps++)                                            //  measured arithmetic: 100 full passes
   for (ii = 0; ii < 1000; ii++)
   for (jj = 0; jj < 1000; jj++)
      matrix[ii][jj] = matrix[ii][jj] + matrix[ii][jj] * matrix[ii][jj];

   return reps;                                                                  //  reps == 100 after the loop
}


/********************************************************************************/

//  Entry point: start the SMP-1 benchmark.
//  Starts Nthreads detached smp1_thread() workers. Each worker is handed the
//  address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_smp1()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(smp1_thread, &tid[slot]);
      slot++;
   }
}


//  Worker thread for the SMP-1 benchmark.
//  varg points to an int holding this thread's number; thread 0 reports.
//  For 'runtime' seconds, or until killThreads is set, increments its own
//  counter through zadd_locked(). The rate stored for the thread is the
//  count divided by the nominal 'runtime', not by measured elapsed time.
//  After synch_threads(0) thread 0 prints every thread's rate and the
//  combined rate of all threads, unless killed.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.

void * smp1_thread(void *varg)
{
   static int     hits[10];                                                      //  per-thread counters, shared by all
   static double  rates[10];                                                     //  per-thread rates, shared by all
   char           line[100], cell[20];
   double         tend, sum;
   int            me = * ((int *) varg);
   int            tx;

   if (me == 0)
      wprintz("\n SMP-1 benchmark: thread switch rates \n",1);

   rampCPU();

   tend = get_seconds() + runtime;                                               //  stop time
   hits[me] = 0;

   while (get_seconds() < tend && ! killThreads)                                 //  increment mutex-locked counter
      zadd_locked(hits[me],+1);

   rates[me] = (double) hits[me] / runtime;

   synch_threads(0);

   if (me == 0 && ! killThreads)
   {
      strcpy(line,"  threads: ");

      tx = 0;
      while (tx < Nthreads) {                                                    //  append one column per thread
         sprintf(cell,"%.0f ",rates[tx]);
         strcat(line,cell);
         tx++;
      }

      strcat(line,"\n");
      wprintz(line,0);

      sum = 0;
      for (tx = 0; tx < Nthreads; tx++)                                          //  combined count of all threads
         sum += hits[tx];
      sum /= runtime;

      sprintf(line,"    total: %.0f / sec. \n",sum);
      wprintz(line,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done

   if (NTbusy == 0) {                                                            //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }

   return 0;
}


/********************************************************************************/

//  Entry point: start the SMP-2 benchmark.
//  Starts a single detached smp2_thread1() controller with a null argument;
//  the controller itself starts the short-lived benchmark threads.
//  NTbusy is raised by one for the controller before it is started.
//  NOTE(review): m_loopy() is defined elsewhere, presumably to clear a prior
//  run - confirm.

void m_smp2()
{
   m_loopy();                                                                    //  3.3

   zadd_locked(NTbusy,+1);                                                       //  count the controller as busy
   start_detached_thread(smp2_thread1,0);
}


//  threads for SMP-2 benchmark

void * smp2_thread1(void *varg)
{
   double   starts, secs, time0, rate;
   int      NTbase = NTbusy;
   char     message[100];
   
   wprintz("\n SMP-2 benchmark: thread start/complete rates \n",1);
   sprintf(message,"  parallel threads: %d \n",Nthreads);
   wprintz(message,0);

   rampCPU();
   starts = 0;
   time0 = get_seconds();
   secs = 0.0;

   while (secs < runtime && ! killThreads)
   {
      if (NTbusy < Nthreads + NTbase) {
         starts++;
         zadd_locked(NTbusy,+1);
         start_detached_thread(smp2_thread2,0);
      }

      secs = get_seconds() - time0;
   }
   
   rate = starts / secs;
   sprintf(message,"    total: %.0f / sec \n",rate);
   wprintz(message,0);

   while (NTbusy > NTbase) zsleep(0.1);
   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   pthread_exit(0);
   return 0;       
}


//  Short-lived thread started by smp2_thread1() for the SMP-2 benchmark.
//  Does no work: it decrements NTbusy (raised by the starter before the
//  launch) and exits. varg is not used.
//  The body is the measured thread: leave it as is.

void * smp2_thread2(void *varg)
{
   zadd_locked(NTbusy,-1);
   pthread_exit(0);
   return 0;
}


/********************************************************************************/

//  Entry point: start the SMP-3 benchmark.
//  Starts Nthreads detached smp3_thread() workers. Each worker is handed the
//  address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_smp3()                                                                    //  1.1
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(smp3_thread, &tid[slot]);
      slot++;
   }
}


//  thread for SMP-3 benchmark
//
//  varg points to an int holding this thread's number; thread 0 reports.
//  Until 'runtime' seconds have elapsed or killThreads is set, runs the shell
//  command "echo -n" through zsystem() and counts the runs in smp3_count[th].
//  The first non-zero zsystem() status is remembered and reported on stdout
//  after the loop, so a failure is not hidden by later successful runs.
//  After synch_threads(0), thread 0 (unless killed) prints the sum of all
//  threads' counts divided by its own elapsed seconds.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.
//  NOTE(review): strerror(err) assumes zsystem() returns an errno-style code - confirm.

void * smp3_thread(void *varg)                                                   //  1.1
{
   int         ii, th, total, status, err = 0;
   double      secs, time0, rate;
   static int  smp3_count[10];                                                   //  per-thread run counts, shared by all
   char        message[100];

   th = * ((int *) varg);

   if (th == 0) {
      wprintz("\n SMP-3 benchmark: process creation/complete rates \n",1);
      sprintf(message,"  parallel processes: %d \n",Nthreads);
      wprintz(message,0);
   }

   rampCPU();
   time0 = get_seconds();
   secs = 0.0;
   smp3_count[th] = 0;

   while (secs < runtime && ! killThreads)
   {
      ++smp3_count[th];
      status = zsystem("echo -n");
      if (status && ! err) err = status;                                         //  keep the first failure
      secs = get_seconds() - time0;
   }

   if (err) printf("error: %s \n",strerror(err));

   synch_threads(0);

   if (th == 0 && ! killThreads)
   {
      total = 0;
      for (ii = 0; ii < Nthreads; ii++)
         total += smp3_count[ii];
      rate = 1.0 * total / secs;
      sprintf(message,"    total: %.0f / sec \n",rate);
      wprintz(message,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done
   if (! NTbusy) {                                                               //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;
}


/********************************************************************************/

//  Entry point: start the global lock benchmark.
//  Starts Nthreads detached global_lock_thread() workers. Each worker is handed
//  the address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_global_lock()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(global_lock_thread, &tid[slot]);
      slot++;
   }
}


//  thread for global lock benchmark
//
//  varg points to an int holding this thread's number; thread 0 reports.
//  For 'runtime' seconds, or until killThreads is set, repeatedly takes the
//  lock named /tmp/lbench_global_lock with global_lock(), counts it, and
//  releases it with global_unlock(). global_lock() is retried while it
//  returns a negative value; the retry loop also watches killThreads so the
//  thread can still be stopped if the lock can never be obtained.
//  The rate stored for the thread is the count divided by the nominal
//  'runtime'. After synch_threads(0) thread 0 prints every thread's rate and
//  the combined rate, unless killed.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.

void * global_lock_thread(void *varg)
{
   int            fd, ii, th;
   static int     count[10];                                                     //  per-thread lock counts, shared by all
   double         time0, time1, total;
   static double  rate[10];                                                      //  per-thread rates, shared by all
   cchar          *lockname = "/tmp/lbench_global_lock";
   char           message1[100], message2[20];

   th = * ((int *) varg);

   if (th == 0)
      wprintz("\n global lock benchmark: lock/unlock rates \n",1);

   rampCPU();

   time0 = get_seconds();
   time1 = time0 + runtime;                                                      //  stop time
   count[th] = 0;

   while (get_seconds() < time1 && ! killThreads)                                //  increment counter
   {
      while ((fd = global_lock(lockname)) < 0)                                   //  wait for lock
         if (killThreads) break;                                                 //  killed: stop waiting
      if (fd < 0) break;                                                         //  no lock held, nothing to unlock
      count[th]++;                                                               //  count locks made
      global_unlock(fd,lockname);                                                //  unlock
   }

   rate[th] = 1.0 * count[th] / runtime;

   synch_threads(0);

   if (th == 0 && ! killThreads)
   {
      sprintf(message1,"  threads: ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2,"%.0f ",rate[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);

      total = 0;
      for (ii = 0; ii < Nthreads; ii++)
         total += count[ii];
      total = total / runtime;
      sprintf(message1,"    total: %.0f / sec. \n",total);
      wprintz(message1,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done
   if (! NTbusy) {                                                               //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;
}


/********************************************************************************/

//  Entry point: start the fibonacci benchmark.
//  Starts Nthreads detached fibo_thread() workers. Each worker is handed the
//  address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_fibo()
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(fibo_thread, &tid[slot]);
      slot++;
   }
}


//  thread for fibonacci benchmark

void * fibo_thread(void *varg)
{
   int            th, ii;
   char           header[100];
   char           message1[100], message2[20];
   double         time0;
   double         fibo_function(int);
   double         fiboNum;
   static double  secs[10];
   
   th = * ((int *) varg);

   if (th == 0) {
      snprintf(header,99,"\n Fibonacci benchmark: %d \n",fiboN);
      wprintz(header,1);
   }

   rampCPU();

   time0 = get_seconds();
   fiboNum = fibo_function(fiboN);
   secs[th] = get_seconds() - time0;
   
   synch_threads(0);
   
   if (th == 0 && ! killThreads)
   {
      sprintf(message1,"  fibonacci value: %.0f \n",fiboNum);
      wprintz(message1,0);

      sprintf(message1,"  seconds: ");
      for (ii = 0; ii < Nthreads; ii++) {
         sprintf(message2," %5.1f",secs[ii]);
         strcat(message1,message2);
      }
      strcat(message1,"\n");
      wprintz(message1,0);
   }

   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}


//  fibonacci function
//
//  Computes the kk-th Fibonacci number by plain double recursion
//  (value 0 for kk < 1, value 1 for kk == 1); the recursion itself is the
//  benchmark workload. The result is returned as a double.
//  If killThreads is set, every call returns 0 at once, so the recursion
//  unwinds without further descent and the result is not meaningful.

double fibo_function(int kk)
{
   if (killThreads) return 0;                                                    //  stop descending once killed
   if (kk > 1) return fibo_function(kk-1) + fibo_function(kk-2);
   if (kk == 1) return 1;
   return 0;
}


/********************************************************************************/

//  Entry point: start the whetstone benchmark.
//  Starts Nthreads detached whetstone_thread() workers. Each worker is handed
//  the address of its own tid[] slot, from which it reads its thread number.
//  NTbusy is raised by one per worker here; each worker lowers it on exit.
//  NOTE(review): m_loopy() and synch_threads(Nthreads) are defined elsewhere,
//  presumably to clear a prior run and arm the thread synchronization - confirm.

void m_whetstone()                                                               //  2.0
{
   int   slot = 0;

   m_loopy();                                                                    //  3.3
   synch_threads(Nthreads);

   while (slot < Nthreads)
   {
      zadd_locked(NTbusy,+1);                                                    //  count the worker as busy before it starts
      start_detached_thread(whetstone_thread, &tid[slot]);
      slot++;
   }
}


//  Worker thread for the whetstone benchmark.
//  varg points to an int holding this thread's number; thread 0 reports.
//  Calls whetstone_function(1) repeatedly until 'runtime' seconds have elapsed
//  or killThreads is set, adding up the whetstone counts it returns and
//  recording the elapsed seconds. After synch_threads(0) thread 0 prints,
//  for every thread, count / seconds / 1000000, unless killed.
//  On exit NTbusy is decremented; the thread that brings it to zero prints " ready".
//  Always returns 0.

void * whetstone_thread(void *varg)                                              //  2.0
{
   double whetstone_function(int reps);

   static double  totals[10], elapsed[10];                                       //  per-thread results, shared by all
   char           line[100], cell[20];
   double         tstart;
   int            me = * ((int *) varg);
   int            tx;

   if (me == 0)
      wprintz("\n Whetstone benchmark \n",1);

   rampCPU();

   totals[me] = 0;
   elapsed[me] = 0.0;
   tstart = get_seconds();

   while (elapsed[me] < runtime && ! killThreads)
   {
      totals[me] += whetstone_function(1);
      elapsed[me] = get_seconds() - tstart;
   }

   synch_threads(0);

   if (me == 0 && ! killThreads)
   {
      strcpy(line,"  whetstone MIPS: ");

      tx = 0;
      while (tx < Nthreads) {                                                    //  append one column per thread
         sprintf(cell,"%6.0f",totals[tx] / elapsed[tx] / 1000000.0);
         strcat(line,cell);
         tx++;
      }

      strcat(line,"\n");
      wprintz(line,0);
   }

   zadd_locked(NTbusy,-1);                                                       //  this thread is done

   if (NTbusy == 0) {                                                            //  last thread out reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }

   return 0;
}


/********************************************************************************

   whetstone benchmark function

   Original benchmark program (Painter Engineering) was converted to
   a callable function and modified for the GNU C compiler and to be 
   thread safe. Must be compiled with zero optimizations (g++ -O0).
      Mike Cornelison, Oct. 2012

*********************************************************************************
*
*  C Converted Whetstone Double Precision Benchmark
*		Version 1.2	22 March 1998
*
*	(c) Copyright 1998 Painter Engineering, Inc.
*		All Rights Reserved.
*
*		Permission is granted to use, duplicate, and
*		publish this text and program as long as it
*		includes this entire comment block and limited
*		rights reference.
*
*  Converted by Rich Painter, Painter Engineering, Inc. based on the
*  www.netlib.org benchmark/whetstoned version obtained 16 March 1998.
*
*  A novel approach was used here to keep the look and feel of the
*  FORTRAN version.  Altering the FORTRAN-based array indices,
*  starting at element 1, to start at element 0 for C, would require
*  numerous changes, including decrementing the variable indices by 1.
*  Instead, the array E1[] was declared 1 element larger in C.  This
*  allows the FORTRAN index range to function without any literal or
*  variable indices changes.  The array element E1[0] is simply never
*  used and does not alter the benchmark results.
*
*  The major FORTRAN comment blocks were retained to minimize
*  differences between versions.  Modules N5 and N12, like in the
*  FORTRAN version, have been eliminated here.
*
*  Questions and comments may be directed to the author at
*			r.painter@ieee.org
*
*********************************************************************************/

//  This function executes 1 billion whetstones * REPS
//  and returns the total whetstones executed.
//
//  REPS is the number of passes made through the module sequence below.
//  The return value is 100000.0 * LOOP * REPS (LOOP = 10000). It is built in
//  a double, so it is not rounded to float precision for larger REPS.
//  killThreads is tested at the start of every pass; if it is set the
//  function returns 0 immediately.
//  The DSIN ... IF macros are defined inside the function body and stay
//  defined for the rest of the file.
//  The module statements are the measured workload: keep them unchanged.

double whetstone_function(int REPS)                                              //  2.0
{
   /* map the FORTRAN math functions, etc. to the C versions */
   #define DSIN	sin
   #define DCOS	cos
   #define DATAN	atan
   #define DLOG	log
   #define DEXP	exp
   #define DSQRT	sqrt
   #define IF		if

   /* function prototypes */
   void PA(double E[], double T, double T2);
   void P0(double E[], int J, int K, int L);
   void P3(double T, double T2, double X, double Y, double *Z);

   double T,T1,T2,E1[5];                                                         //  E1[0] is not used (FORTRAN indexing)
   int I,J,K,L;
   int N1, N2, N3, N4, N6, N7, N8, N9, N10, N11;                                 //  loop counts for modules 1-11
   double X1,X2,X3,X4,X,Y,Z;
   int LOOP;
   int  II, JJ;
   double WHETS;                                                                 //  double: exact for any REPS

//  The actual benchmark starts here.

   T  = .499975;
   T1 = 0.50025;
   T2 = 2.0;

//  With LOOP=10000, one billion Whetstone instructions are executed.

   LOOP = 10000;
   II   = REPS;
   JJ = 1;

IILOOP:
   if (killThreads) return 0;
   N1  = 0;                                                                      //  module 1 loop does not execute
   N2  = 12 * LOOP;
   N3  = 14 * LOOP;
   N4  = 345 * LOOP;
   N6  = 210 * LOOP;
   N7  = 32 * LOOP;
   N8  = 899 * LOOP;
   N9  = 616 * LOOP;
   N10 = 0;                                                                      //  module 10 loop does not execute
   N11 = 93 * LOOP;

//  Module 1: Simple identifiers

   X1  =  1.0;
   X2  = -1.0;
   X3  = -1.0;
   X4  = -1.0;

   for (I = 1; I <= N1; I++) {
       X1 = (X1 + X2 + X3 - X4) * T;
       X2 = (X1 + X2 - X3 + X4) * T;
       X3 = (X1 - X2 + X3 + X4) * T;
       X4 = (-X1+ X2 + X3 + X4) * T;
   }

//  Module 2: Array elements

   E1[1] =  1.0;
   E1[2] = -1.0;
   E1[3] = -1.0;
   E1[4] = -1.0;

   for (I = 1; I <= N2; I++) {
       E1[1] = ( E1[1] + E1[2] + E1[3] - E1[4]) * T;
       E1[2] = ( E1[1] + E1[2] - E1[3] + E1[4]) * T;
       E1[3] = ( E1[1] - E1[2] + E1[3] + E1[4]) * T;
       E1[4] = (-E1[1] + E1[2] + E1[3] + E1[4]) * T;
   }

//  Module 3: Array as parameter

   for (I = 1; I <= N3; I++)
      PA(E1,T,T2);

//  Module 4: Conditional jumps

   J = 1;
   for (I = 1; I <= N4; I++) {
      if (J == 1) J = 2;
      else J = 3;
      if (J > 2) J = 0;
      else J = 1;
      if (J < 1) J = 1;
      else J = 0;
   }

//  Module 5: Omitted
//  Module 6: Integer arithmetic

   J = 1;
   K = 2;
   L = 3;

   for (I = 1; I <= N6; I++) {
       J = J * (K-J) * (L-K);
       K = L * K - (L-J) * K;
       L = (L-K) * (K+J);
       E1[L-1] = J + K + L;
       E1[K-1] = J * K * L;
   }

//  Module 7: Trigonometric functions

   X = 0.5;
   Y = 0.5;

   for (I = 1; I <= N7; I++) {
      X = T * DATAN(T2*DSIN(X)*DCOS(X)/(DCOS(X+Y)+DCOS(X-Y)-1.0));
      Y = T * DATAN(T2*DSIN(Y)*DCOS(Y)/(DCOS(X+Y)+DCOS(X-Y)-1.0));
   }

//  Module 8: Procedure calls

   X = 1.0;
   Y = 1.0;
   Z = 1.0;

   for (I = 1; I <= N8; I++)
      P3(T,T2,X,Y,&Z);

//  Module 9: Array references

   J = 1;
   K = 2;
   L = 3;
   E1[1] = 1.0;
   E1[2] = 2.0;
   E1[3] = 3.0;

   for (I = 1; I <= N9; I++) 
      P0(E1,J,K,L);

//  Module 10: Integer arithmetic

   J = 2;
   K = 3;

   for (I = 1; I <= N10; I++) {
       J = J + K;
       K = J + K;
       J = K - J;
       K = K - J - J;
   }

//  Module 11: Standard functions

   X = 0.75;

   for (I = 1; I <= N11; I++)
      X = DSQRT(DEXP(DLOG(X)/T1));

//  THIS IS THE END OF THE MAJOR LOOP.

   if (++JJ <= II) goto IILOOP;

//  Stop benchmark timing at this point.

   WHETS = (100000.0 * LOOP * II);
   return WHETS;
}

//  Whetstone module 3 procedure (array as parameter).
//  Applies the four-element update to E[1]..E[4] six times; E[0] is not used.
//  The first three elements are scaled by T, the fourth is divided by T2.
//  Part of the measured workload: keep the statements unchanged.

void PA(double E[], double T, double T2)
{
   for (int J = 0; J < 6; J++)
   {
      E[1] = ( E[1] + E[2] + E[3] - E[4]) * T;
      E[2] = ( E[1] + E[2] - E[3] + E[4]) * T;
      E[3] = ( E[1] - E[2] + E[3] + E[4]) * T;
      E[4] = (-E[1] + E[2] + E[3] + E[4]) / T2;
	}
}

//  Whetstone module 9 procedure (array references).
//  Copies E[K] to E[J], then E[L] to E[K], then E[J] to E[L]. For distinct
//  J, K, L this leaves the former E[K] in both E[J] and E[L], and the
//  former E[L] in E[K].
//  Part of the measured workload: keep the statements unchanged.

void P0(double E[], int J, int K, int L)
{
	E[J] = E[K];
	E[K] = E[L];
	E[L] = E[J];
}

//  Whetstone module 8 procedure (procedure calls).
//  With X1 = T * (X + Y) and Y1 = T * (X1 + Y), stores (X1 + Y1) / T2 in *Z.
//  X and Y are passed by value, so the caller's copies are not changed.
//  Part of the measured workload: keep the statements unchanged.

void P3(double T, double T2, double X, double Y, double *Z)
{
	double X1, Y1;

	X1 = X;
	Y1 = Y;
	X1 = T * (X1 + Y1);
	Y1 = T * (X1 + Y1);                                                          //  uses the updated X1
	*Z  = (X1 + Y1) / T2;
}


/********************************************************************************/

//  Entry point: start the linpack benchmark.
//  Starts one detached linpack_thread(), whatever Nthreads is, and passes it
//  the address of tid[0]. NTbusy is raised by one for the thread before it
//  is started; the thread lowers it on exit.
//  NOTE(review): m_loopy() is defined elsewhere, presumably to clear a prior
//  run - confirm.

void m_linpack()                                                                 //  2.1
{
   m_loopy();                                                                    //  3.3

   zadd_locked(NTbusy,+1);                                                       //  count the thread as busy before it starts
   start_detached_thread(linpack_thread, &tid[0]);
}


//  Thread for the linpack benchmark. varg is not used.
//  Prints the benchmark heading, runs linpack_function() once and prints the
//  Mflops value it returns.
//  On exit NTbusy is decremented and, if that brings it to zero, " ready"
//  is printed. Always returns 0.

void * linpack_thread(void *varg)                                                //  2.1
{
   double linpack_function();

   char     text[100];
   double   mflops;

   wprintz("\n linpack benchmark \n",1);

   rampCPU();
   mflops = linpack_function();

   sprintf(text,"  linpack Mflops: %7.1f \n",mflops);
   wprintz(text,0);

   zadd_locked(NTbusy,-1);                                                       //  this thread is done

   if (NTbusy == 0) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }

   return 0;
}


/********************************************************************************

  Linpack double precision benchmark function

  Code was adapted from Roy Longbottom:
    www.roylongbottom.org.uk

  The original main() was made into a thread-safe function.
  Compile with no optimization to get CPU performance
    instead of compiler performance.

  Function returns total linpack Mflops.

*********************************************************************************/

//  Constants for the linpack benchmark code.
//  NOTE(review): uses of PREC and NTIMES are not visible in this part of the
//  file - confirm they are still needed.

#define ZERO 0.0e0
#define ONE 1.0e0
#define PREC "Double"
#define NTIMES 10

//  Prototypes of the linpack routines called by linpack_function().
//  NOTE(review): only matgen, dgefa, dgesl and dmxpy are called in the visible
//  part of linpack_function(); the others are presumably used by those
//  routines - confirm.

void matgen (double a[], int lda, int n, double b[], double *norma);
void dgefa (double a[], int lda, int n, int ipvt[], int *info);
void dgesl (double a[],int lda,int n,int ipvt[],double b[],int job);
void dmxpy (int n1, double y[], int n2, int ldm, double x[], double m[]);
void daxpy (int n, double da, double dx[], int incx, double dy[], int incy);
double epslon (double x);
int idamax (int n, double dx[], int incx);
void dscal (int n, double da, double dx[], int incx);
double ddot (int n, double dx[], int incx, double dy[], int incy);

//  NOTE(review): file-scope variable shared by all threads; its use is not
//  visible here - presumably a target run time in seconds for the linpack
//  timing passes - confirm.

double runSecs = 0.5;


/*
   Run the linpack benchmark on a 100 x 100 system and return Mflops.

   Sequence:
     1. one timed dgefa + dgesl run, followed by a residual computation
        (resid and normx are computed but not used afterwards)
     2. calibrate 'loop' so that 'loop' matgen calls exceed runSecs
        (or until 20 calibration passes have been made)
     3. calibrate 'ntimes' so that matgen + dgefa repeated 'ntimes' times
        takes about runSecs
     4. five timed passes with array a (leading dimension 201)
     5. five timed passes with array aa (leading dimension 200)

   atime[row][col] rows:  0 = dgefa time per call, 1 = dgesl time per call,
     2 = their sum, 3 = Mflops (ops / (1e6 * total)), 4 = 2.0 / Mflops,
     5 = total / cray.
   Columns: 0 = initial single run, 1-5 = first set of passes,
     6 = average Mflops of the first set (row 3 only), 7-11 = second set,
     12 = average Mflops of the second set (row 3 only).

   Returns the smaller of the two average Mflops values.
   All work arrays are local (about 640 KB of stack for a and aa).
*/

double linpack_function()                                                        //  2.1
{
        double start_time, elapsed;
        double atime[6][15];                                                     //  3.2
        double aa[200*200],a[200*201],b[200],x[200];       
        double cray,ops,total,norma,normx;
        double resid,tm2;
        double mflops;
        int ipvt[200],n,i,j,ntimes,info,lda,ldaa;
        int pass, loop;
        double overhead1, overhead2, time2;

        lda = 201;                              /* leading dimension of a  */
        ldaa = 200;                             /* leading dimension of aa */
        cray = .056;                            /* divisor for the atime[5] ratio only */
        n = 100;                                /* order of the system */

        /* operation count credited to one factor + solve */
        ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
        
        /* initial single run: time dgefa and dgesl separately */
        matgen(a,lda,n,b,&norma);
        start_time = get_seconds();
        dgefa(a,lda,n,ipvt,&info);
        elapsed = get_seconds() - start_time;
        atime[0][0] = elapsed;
        start_time = get_seconds();
        dgesl(a,lda,n,ipvt,b,0);
        elapsed = get_seconds() - start_time;
        atime[1][0] = elapsed;
        total = atime[0][0] + atime[1][0];

/*     compute a residual to verify results.  */ 

        for (i = 0; i < n; i++) {
                x[i] = b[i];                    /* keep the solution */
        }
        matgen(a,lda,n,b,&norma);               /* regenerate matrix and right side */
        for (i = 0; i < n; i++) {
                b[i] = -b[i];
        }
        dmxpy(n,b,n,lda,x,a);                   /* b = a*x - b */
        resid = 0.0;
        normx = 0.0;
        for (i = 0; i < n; i++) {
                resid = (resid > fabs((double)b[i])) 
                        ? resid : fabs((double)b[i]);
                normx = (normx > fabs((double)x[i])) 
                        ? normx : fabs((double)x[i]);
        }
        
        atime[2][0] = total;
        if (total > 0.0)                        /* guard against a zero timer reading */
        {
            atime[3][0] = ops/(1.0e6*total);
            atime[4][0] = 2.0/atime[3][0];
        }
        else
        {
            atime[3][0] = 0.0;
            atime[4][0] = 0.0;
        }
        atime[5][0] = total/cray;
       

/************************************************************************
 *       Calculate overhead of executing matgen procedure               *
 ************************************************************************/
       
        /* pass counts up from -20; the loop ends when it reaches 0, either
           after 20 passes or as soon as one pass takes longer than runSecs */
        pass = -20;
        loop = NTIMES;
        do
        {
            start_time = get_seconds();
            pass = pass + 1;        
            for ( i = 0 ; i < loop ; i++)
            {
                 matgen(a,lda,n,b,&norma);
            }
            elapsed = get_seconds() - start_time;
            overhead1 = elapsed;

            if (overhead1 > runSecs)
            {
                pass = 0;
            }
            if (pass < 0)
            {
                /* too short: grow the repeat count, faster when far below target */
                if (overhead1 < 0.1)
                {
                    loop = loop * 10;
                }
                else
                {
                    loop = loop * 2;
                }
            }
        }
        while (pass < 0);
        
        overhead1 = overhead1 / (double)loop;   /* seconds per matgen call */


/************************************************************************
 *           Calculate matgen/dgefa passes for runSecs seconds          *
 ************************************************************************/
       
        /* same calibration scheme as above, now for matgen + dgefa */
        pass = -20;
        ntimes = NTIMES;
        do
        {
            start_time = get_seconds();
            pass = pass + 1;        
            for ( i = 0 ; i < ntimes ; i++)
            {
                matgen(a,lda,n,b,&norma);
                dgefa(a,lda,n,ipvt,&info );
            }
            elapsed = get_seconds() - start_time;
            time2 = elapsed;

            if (time2 > runSecs)
            {
                pass = 0;
            }
            if (pass < 0)
            {
                if (time2 < 0.1)
                {
                    ntimes = ntimes * 10;
                }
                else
                {
                    ntimes = ntimes * 2;
                }
            }
        }
        while (pass < 0);
        
        /* scale the repeat count so one timed pass lasts about runSecs */
        ntimes =  (int)(runSecs * (double)ntimes / time2);
        if (ntimes == 0) ntimes = 1;


/************************************************************************
 *                              Execute 5 passes                        *
 ************************************************************************/
      
        tm2 = ntimes * overhead1;               /* matgen time to subtract from each pass */
        atime[3][6] = 0;                        /* accumulates Mflops of passes 1-5 */

        for (j=1 ; j<6 ; j++)
        {
            start_time = get_seconds();
            for (i = 0; i < ntimes; i++)
            {
                matgen(a,lda,n,b,&norma);
                dgefa(a,lda,n,ipvt,&info );
            }
            elapsed = get_seconds() - start_time;
            atime[0][j] = (elapsed - tm2)/ntimes;

            start_time = get_seconds();
            for (i = 0; i < ntimes; i++)
            {
                dgesl(a,lda,n,ipvt,b,0);
            }
            elapsed = get_seconds() - start_time;

            atime[1][j] = elapsed/ntimes;
            total       = atime[0][j] + atime[1][j];
            atime[2][j] = total;
            atime[3][j] = ops/(1.0e6*total);
            atime[4][j] = 2.0/atime[3][j];
            atime[5][j] = total/cray;
            atime[3][6] = atime[3][6] + atime[3][j];
        }

        atime[3][6] = atime[3][6] / 5.0;        /* average Mflops, leading dimension 201 */


/************************************************************************
 *             Calculate overhead of executing matgen procedure         *
 ************************************************************************/

        /* repeat the matgen overhead measurement for array aa,
           reusing the repeat count 'loop' found earlier */
        start_time = get_seconds();
        for ( i = 0 ; i < loop ; i++)
        {
            matgen(aa,ldaa,n,b,&norma);    
        }
        elapsed = get_seconds() - start_time;
        overhead2 = elapsed;
        overhead2 = overhead2 / (double)loop;
        

/************************************************************************
 *                              Execute 5 passes                        *
 ************************************************************************/
              
        tm2 = ntimes * overhead2;
        atime[3][12] = 0;                       /* accumulates Mflops of passes 7-11 */

        for (j=7 ; j<12 ; j++)
        {
            start_time = get_seconds();
            for (i = 0; i < ntimes; i++)
            {
                matgen(aa,ldaa,n,b,&norma);
                dgefa(aa,ldaa,n,ipvt,&info  );
            }
            elapsed = get_seconds() - start_time;
            atime[0][j] = (elapsed - tm2)/ntimes;
            
            start_time = get_seconds();
            for (i = 0; i < ntimes; i++)
            {
                dgesl(aa,ldaa,n,ipvt,b,0);
            }
            elapsed = get_seconds() - start_time;
            atime[1][j] = elapsed/ntimes;
            total       = atime[0][j] + atime[1][j];
            atime[2][j] = total;
            atime[3][j] = ops/(1.0e6*total);
            atime[4][j] = 2.0/atime[3][j];
            atime[5][j] = total/cray;
            atime[3][12] = atime[3][12] + atime[3][j];
        }

        atime[3][12] = atime[3][12] / 5.0;      /* average Mflops, leading dimension 200 */

      /* report the lower of the two averages */
      mflops = atime[3][6];
      if (atime[3][12] < mflops) mflops = atime[3][12];
      return mflops;
}
     

/* We would like to declare a[][lda], but c does not allow it.  In this
function, references to a[i][j] are written a[lda*i+j].  */

/* Fill the n x n matrix a (element (i,j) stored at a[lda*j+i]) with
   pseudo-random values from a fixed-seed generator, and set
   b[i] to the sum of the elements a[lda*j+i] over all j.
   *norma receives the largest generated value, or 0.0 if none is positive.
   The sequence is the same on every call.  */

void matgen (double a[], int lda, int n, double b[], double *norma)
{
        int seed = 1325;
        int col, row;

        *norma = 0.0;

        for (col = 0; col < n; col++) {
                double *column = a + lda*col;

                for (row = 0; row < n; row++) {
                        seed = 3125*seed % 65536;
                        column[row] = (seed - 32768.0)/16384.0;
                        if (column[row] > *norma)
                                *norma = column[row];
                }
        }

        for (row = 0; row < n; row++)
                b[row] = 0.0;

        for (col = 0; col < n; col++) {
                const double *column = a + lda*col;

                for (row = 0; row < n; row++)
                        b[row] += column[row];
        }
}


/*     gaussian elimination with partial pivoting       */

/*
   Factor the n x n matrix a in place by gaussian elimination with
   partial pivoting.

     a      matrix, element (i,j) stored at a[lda*j+i]; overwritten with
            the factors (negated multipliers below the diagonal)
     lda    leading dimension of a
     n      order of the matrix
     ipvt   output, ipvt[k] = index of the pivot chosen at step k
     info   output, 0 initially; set to k when the pivot at step k is zero
            (a zero pivot at step 0 therefore also leaves 0)
*/

void dgefa(double a[], int lda, int n, int ipvt[], int *info)
{
   double t;
   int j,k,kp1,l,nm1;

        *info = 0;
        nm1 = n - 1;
        if (nm1 >=  0) {
                for (k = 0; k < nm1; k++) {
                        kp1 = k + 1;

                        /* find l = pivot index */

                        l = idamax(n-k,&a[lda*k+k],1) + k;
                        ipvt[k] = l;

                        /* zero pivot implies this column already 
                           triangularized */

                        if (a[lda*k+l] != ZERO) {

                                /* interchange if necessary */

                                if (l != k) {
                                        t = a[lda*k+l];
                                        a[lda*k+l] = a[lda*k+k];
                                        a[lda*k+k] = t; 
                                }

                                /* compute multipliers */

                                t = -ONE/a[lda*k+k];
                                dscal(n-(k+1),t,&a[lda*k+k+1],1);

                                /* row elimination with column indexing */

                                for (j = kp1; j < n; j++) {
                                        t = a[lda*j+l];
                                        if (l != k) {
                                                a[lda*j+l] = a[lda*j+k];
                                                a[lda*j+k] = t;
                                        }
                                        daxpy(n-(k+1),t,&a[lda*k+k+1],1,
                                              &a[lda*j+k+1],1);
                                } 
                        }
                        else { 
                                *info = k;
                        }
                } 
        }
        /* last diagonal element needs no elimination, only the zero check */
        ipvt[n-1] = n-1;
        if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
        return;
}


/*
   Solve a linear system using the factors computed by dgefa.

     a      factored matrix from dgefa, element (i,j) at a[lda*j+i]
     lda    leading dimension of a
     n      order of the matrix
     ipvt   pivot indices from dgefa
     b      on entry the right hand side, on return the solution x
     job    0        solve  a * x = b
            nonzero  solve  trans(a) * x = b

   A zero on the diagonal of the factored matrix causes a division by zero;
   this routine does not test for it.
*/

void dgesl(double a[],int lda,int n,int ipvt[],double b[],int job )
{
        double t;
        int k,kb,l,nm1;

        nm1 = n - 1;
        if (job == 0) {

                /* job = 0 , solve  a * x = b
                   first solve  l*y = b         */

                if (nm1 >= 1) {
                        for (k = 0; k < nm1; k++) {
                                l = ipvt[k];
                                t = b[l];
                                if (l != k){ 
                                        b[l] = b[k];
                                        b[k] = t;
                                }       
                                daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
                        }
                } 

                /* now solve  u*x = y */

                for (kb = 0; kb < n; kb++) {
                    k = n - (kb + 1);
                    b[k] = b[k]/a[lda*k+k];
                    t = -b[k];
                    daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
                }
        }
        else { 

                /* job = nonzero, solve  trans(a) * x = b
                   first solve  trans(u)*y = b                  */

                for (k = 0; k < n; k++) {
                        t = ddot(k,&a[lda*k+0],1,&b[0],1);
                        b[k] = (b[k] - t)/a[lda*k+k];
                }

                /* now solve trans(l)*x = y
                   k must run from n-2 down to 0 inclusive, so kb runs
                   from 1 through nm1 (the Fortran loop is "do kb = 1, nm1") */

                if (nm1 >= 1) {
                        for (kb = 1; kb <= nm1; kb++) {
                                k = n - (kb+1);
                                b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
                                l = ipvt[k];
                                if (l != k) {
                                        t = b[l];
                                        b[l] = b[k];
                                        b[k] = t;
                                }
                        }
                }
        }
        return;
}


/*
     constant times a vector plus a vector.
     jack dongarra, linpack, 3/11/78.
*/

/* dy = dy + da*dx over n elements.
   dx is stepped by incx and dy by incy; a negative increment walks that
   vector backwards from element (1-n)*inc.
   Nothing is done when n <= 0 or da is zero.  */

void daxpy(int n, double da, double dx[], int incx, double dy[], int incy)
{
        int i, head;

        if (n <= 0 || da == ZERO) return;

        if (incx != 1 || incy != 1) {

                /* general increments */

                int ix = (incx < 0) ? (-n+1)*incx : 0;
                int iy = (incy < 0) ? (-n+1)*incy : 0;

                for (i = 0; i < n; i++, ix += incx, iy += incy)
                        dy[iy] += da*dx[ix];
                return;
        }

        /* both increments are 1: handle the n % 4 leading elements one
           at a time, then the rest four per iteration */

        head = n % 4;
        for (i = 0; i < head; i++)
                dy[i] += da*dx[i];

        for (i = head; i < n; i += 4) {
                dy[i]   += da*dx[i];
                dy[i+1] += da*dx[i+1];
                dy[i+2] += da*dx[i+2];
                dy[i+3] += da*dx[i+3];
        }
}

   
/*
     forms the dot product of two vectors.
     jack dongarra, linpack, 3/11/78.
*/

/* Return the dot product of n elements of dx and dy.
   dx is stepped by incx and dy by incy; a negative increment walks that
   vector backwards from element (1-n)*inc.
   Returns zero when n <= 0.  */

double ddot(int n, double dx[], int incx, double dy[], int incy)
{
        double sum = ZERO;
        int i, head;

        if (n <= 0) return(ZERO);

        if (incx != 1 || incy != 1) {

                /* general increments */

                int ix = (incx < 0) ? (-n+1)*incx : 0;
                int iy = (incy < 0) ? (-n+1)*incy : 0;

                for (i = 0; i < n; i++, ix += incx, iy += incy)
                        sum += dx[ix]*dy[iy];
                return(sum);
        }

        /* both increments are 1: handle the n % 5 leading elements one
           at a time, then the rest five per iteration
           (terms are added left to right onto the running sum) */

        head = n % 5;
        for (i = 0; i < head; i++)
                sum += dx[i]*dy[i];

        for (i = head; i < n; i += 5) {
                sum = sum + dx[i]*dy[i] +
                      dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
                      dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
        }
        return(sum);
}


/*     

      scales a vector by a constant.
      jack dongarra, linpack, 3/11/78.
*/

/* Multiply n elements of dx by da, stepping through dx by incx.
   Nothing is done when n <= 0. With an increment other than 1 the
   loop runs while the index is below n*incx.  */

void dscal(int n, double da, double dx[], int incx)
{
        int i, head;

        if (n <= 0) return;

        if (incx != 1) {

                /* general increment */

                int limit = n*incx;

                for (i = 0; i < limit; i += incx)
                        dx[i] *= da;
                return;
        }

        /* increment is 1: handle the n % 5 leading elements one at a
           time, then the rest five per iteration */

        head = n % 5;
        for (i = 0; i < head; i++)
                dx[i] *= da;

        for (i = head; i < n; i += 5) {
                dx[i]   *= da;
                dx[i+1] *= da;
                dx[i+2] *= da;
                dx[i+3] *= da;
                dx[i+4] *= da;
        }
}


/*
     finds the index of element having max. absolute value.
     jack dongarra, linpack, 3/11/78.
*/

/* Return the 0-based position (counted in elements, not in array slots)
   of the first element with the largest absolute value among the n
   elements dx[0], dx[incx], dx[2*incx], ...
   Returns -1 when n < 1 and 0 when n == 1.
   incx is expected to be positive.  */

int idamax(int n, double dx[], int incx)
{
        double dmax;
        int i, ix, itemp = 0;

        if( n < 1 ) return(-1);
        if(n ==1 ) return(0);
        if(incx != 1) {

                /* code for increment not equal to 1
                   element i lives at dx[i*incx]; element 0 seeds dmax,
                   so the scan starts at element 1 = dx[incx]
                   (the Fortran original started at 1+incx because its
                   arrays are 1-based) */

                dmax = fabs((double)dx[0]);
                ix = incx;
                for (i = 1; i < n; i++) {
                        if(fabs((double)dx[ix]) > dmax)  {
                                itemp = i;
                                dmax = fabs((double)dx[ix]);
                        }
                        ix = ix + incx;
                }
        }
        else {

                /* code for increment equal to 1 */

                itemp = 0;
                dmax = fabs((double)dx[0]);
                for (i = 1; i < n; i++) {
                        if(fabs((double)dx[i]) > dmax) {
                                itemp = i;
                                dmax = fabs((double)dx[i]);
                        }
                }
        }
        return (itemp);
}


/*
     estimate unit roundoff in quantities of size x.
*/

/* Estimate the unit roundoff for a value of magnitude x.
   eps = |3*(4/3 - 1) - 1| is evaluated in double arithmetic and the
   computation is repeated while the result is exactly zero.
   Returns eps * |x|.  */

double epslon (double x)
{
        const double four_thirds = 4.0e0/3.0e0;
        double third, unity, eps;

        do {
                third = four_thirds - ONE;
                unity = third + third + third;
                eps = fabs(unity - ONE);
        } while (eps == ZERO);

        return(eps*fabs(x));
}
 

/*
   purpose:
     multiply matrix m times vector x and add the result to vector y.

   parameters:

     n1 integer, number of elements in vector y, and number of rows in
         matrix m

     y double [n1], vector of length n1 to which is added 
         the product m*x

     n2 integer, number of elements in vector x, and number of columns
         in matrix m

     ldm integer, leading dimension of array m

     x double [n2], vector of length n2

     m double [ldm][n2], matrix of n1 rows and n2 columns

   We would like to declare m[][ldm], but c does not allow it.  
   In this function, references to m[i][j] are written m[ldm*i+j].

*/

/* y = y + m*x, with the columns of m processed in groups.
   The columns left over after division into groups of 16 are handled
   first (1, then 2, then 4, then 8 columns as the bits of n2 require),
   then the remaining columns are handled sixteen at a time.
   Column j of m starts at m[ldm*j]; the inner loops run over the n1 rows.  */

void dmxpy (int n1, double y[], int n2, int ldm, double x[], double m[])
{
        int j,i,jmin;
        /* cleanup odd vector */

        j = n2 % 2;
        if (j >= 1) {
                j = j - 1;                      /* column 0 */
                for (i = 0; i < n1; i++) 
                        y[i] = (y[i]) + x[j]*m[ldm*j+i];
        } 

        /* cleanup odd group of two vectors */

        j = n2 % 4;
        if (j >= 2) {
                j = j - 1;                      /* last column of the pair */
                for (i = 0; i < n1; i++)
                        y[i] = ( (y[i])
                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
        } 

        /* cleanup odd group of four vectors */

        j = n2 % 8;
        if (j >= 4) {
                j = j - 1;                      /* last column of the group of four */
                for (i = 0; i < n1; i++)
                        y[i] = ((( (y[i])
                               + x[j-3]*m[ldm*(j-3)+i]) 
                               + x[j-2]*m[ldm*(j-2)+i])
                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
        } 

        /* cleanup odd group of eight vectors */

        j = n2 % 16;
        if (j >= 8) {
                j = j - 1;                      /* last column of the group of eight */
                for (i = 0; i < n1; i++)
                        y[i] = ((((((( (y[i])
                               + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
                               + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
                               + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
                               + x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
        } 
        
        /* main loop - groups of sixteen vectors */

        /* j is the last column of each group; the first group starts
           right after the n2 % 16 columns handled above */
        jmin = (n2%16)+16;
        for (j = jmin-1; j < n2; j = j + 16) {
                for (i = 0; i < n1; i++) 
                        y[i] = ((((((((((((((( (y[i])
                                + x[j-15]*m[ldm*(j-15)+i]) 
                                + x[j-14]*m[ldm*(j-14)+i])
                                + x[j-13]*m[ldm*(j-13)+i]) 
                                + x[j-12]*m[ldm*(j-12)+i])
                                + x[j-11]*m[ldm*(j-11)+i]) 
                                + x[j-10]*m[ldm*(j-10)+i])
                                + x[j- 9]*m[ldm*(j- 9)+i]) 
                                + x[j- 8]*m[ldm*(j- 8)+i])
                                + x[j- 7]*m[ldm*(j- 7)+i]) 
                                + x[j- 6]*m[ldm*(j- 6)+i])
                                + x[j- 5]*m[ldm*(j- 5)+i]) 
                                + x[j- 4]*m[ldm*(j- 4)+i])
                                + x[j- 3]*m[ldm*(j- 3)+i]) 
                                + x[j- 2]*m[ldm*(j- 2)+i])
                                + x[j- 1]*m[ldm*(j- 1)+i]) 
                                + x[j]   *m[ldm*j+i];
        }
        return;
} 


/********************************************************************************/

//  start disk I/O benchmark

//  Menu entry for the disk I/O benchmark.
//  Verifies that the scratch file (name + "-0") can be created, prints the
//  heading, then starts one disk_thread per configured thread.
//  On a file error the message is printed and no thread is started.

void m_disk()
{
   char        header[300], filename[200];                                       //  header holds fixed text + full filename
   char        message[200];
   int         fid;

   m_loopy();                                                                    //  3.3

   strncpy0(filename,scratchfile,96);                                            //  test if scratch file OK   1.5
   strcat(filename,"-0");

   fid = open(filename,O_WRONLY+O_CREAT+O_TRUNC+O_SYNC,0600);
   if (fid < 0) {                                                                //  open() returns -1 on failure, 0 is a valid descriptor
      snprintf(message,sizeof(message)," *** file error: %s \n %s",filename,strerror(errno));
      wprintz(message,0);
      return;
   }
   close(fid);

   snprintf(header,sizeof(header),"\n Disk I/O throughput \n"
                                  "  file: %s \n"
                                  "  size: %d MB \n",filename,diskMB);
   wprintz(header,1);

   synch_threads(Nthreads);

   for (int ii = 0; ii < Nthreads; ii++)
   {
      zadd_locked(NTbusy,+1);                                                    //  each thread decrements NTbusy when done
      start_detached_thread(disk_thread, &tid[ii]);
   }
}


//  thread for disk I/O benchmark

//  Print two report lines for the disk benchmark:
//  the given label followed by the IO/sec value of each thread,
//  then an aligned "MB/sec" line with the MB/sec value of each thread.

static void disk_report(const char *label, int nthreads, const double *iops, const double *mbs)
{
   char     line[200], field[20];
   int      kk;

   snprintf(line,sizeof(line),"%s",label);
   for (kk = 0; kk < nthreads; kk++) {
      snprintf(field,sizeof(field)," %6.0f",iops[kk]);
      strcat(line,field);
   }
   strcat(line,"\n");
   wprintz(line,0);

   snprintf(line,sizeof(line)," %24s MB/sec","");
   for (kk = 0; kk < nthreads; kk++) {
      snprintf(field,sizeof(field)," %6.1f",mbs[kk]);
      strcat(line,field);
   }
   strcat(line,"\n");
   wprintz(line,0);
}


//  Thread function for the disk I/O benchmark.
//  varg points to this thread's index (int). Each thread uses its own
//  scratch file (scratch name + "-" + thread digit).
//  For each of the four record sizes in diskKB[] the thread times a
//  sequential write, a sequential read and a random read of diskMB
//  megabytes, all with O_DIRECT. After each phase the threads meet in
//  synch_threads(0) and thread 0 prints the rates of all threads.
//  The scratch file is removed on exit. Always returns 0.

void * disk_thread(void *varg)
{
   char           filename[200], *buff = 0;
   char           label[100], message1[200];
   int            th, fid = -1, ii, kk, wcc, rcc, err, recs, reclen;            //  fid = -1: no file open
   int            flags;
   off_t          posn;                                                          //  file offset, may exceed int range
   double         secs, time0;
   static double  IOPs[10], MBs[10];                                             //  shared: one slot per thread
   
   th = * ((int *) varg);

   strncpy0(filename,scratchfile,96);                                            //  make file name per thread
   strcat(filename,"-0");
   ii = strlen(filename);
   filename[ii-1] = '0' + th;

   for (ii = 0; ii < 4; ii++)                                                    //  four I/O record sizes   1.4
   {
      reclen = diskKB[ii] * 1024;                                                //  record size, bytes
      recs = (int) ((long long) diskMB * 1024 / diskKB[ii]);                     //  records to write and read
      
      err = posix_memalign((void **) &buff,1024,reclen);                         //  allocate I/O buffer, 1024 aligned
      if (err) {
         buff = 0;
         wprintz(" *** memory allocation failure \n",1);                         //  2.9
         goto cleanup;
      }
      
      memset(buff,'a',reclen);                                                   //  3.1
      for (kk = 0; kk < reclen; kk += 200)
         buff[kk] = '\n';

      //  sequential write (direct I/O, no cache)
      
      time0 = get_seconds();

      flags = O_WRONLY + O_CREAT + O_TRUNC + O_DIRECT;                           //  3.1
      fid = open(filename,flags,0600);
      if (fid < 0) goto file_error;
      
      for (kk = 0; kk < recs; kk++)
      {
         wcc = write(fid,buff,reclen);
         if (wcc == -1) goto file_error;
         if (killThreads) goto cleanup;
      }

      syncfs(fid);                                                               //  flush buffers out   3.1

      err = close(fid);
      fid = -1;                                                                  //  closed: cleanup must not close again
      if (err) goto file_error;

      secs = get_seconds() - time0;
      IOPs[th] = recs / secs;
      MBs[th] = diskMB / secs;

      synch_threads(0);
      if (killThreads) goto cleanup;

      if (th == 0)
      {
         sprintf(label," %5d KB   serial write  IO/sec",diskKB[ii]);
         disk_report(label,Nthreads,IOPs,MBs);
      }
      
      //  sequential read (direct, no cache)
      
      time0 = get_seconds();

      flags = O_RDONLY + O_DIRECT;
      fid = open(filename,flags);
      if (fid < 0) goto file_error; 

      for (kk = 0; kk < recs; kk++)
      {
         rcc = read(fid,buff,reclen);
         if (rcc == -1) goto file_error;
         if (rcc == 0) break;
         if (killThreads) goto cleanup;
      }

      err = close(fid);
      fid = -1;
      if (err) goto file_error;
      
      secs = get_seconds() - time0;
      IOPs[th] = recs / secs;
      MBs[th] = diskMB / secs;

      synch_threads(0);
      if (killThreads) goto cleanup;

      if (th == 0)
      {
         sprintf(label," %11s serial read  IO/sec","");
         disk_report(label,Nthreads,IOPs,MBs);
      }

      //  random read (direct, no cache)                                         //  1.1

      time0 = get_seconds();

      flags = O_RDONLY + O_DIRECT;                                               //  3.1
      fid = open(filename,flags);
      if (fid < 0) goto file_error;

      for (kk = 0; kk < recs; kk++)
      {
         posn = (off_t) ((recs - 1) * drand48());                                //  random record number
         posn = posn * diskKB[ii] * 1024;                                        //  byte offset, 64-bit arithmetic
         if (lseek(fid,posn,SEEK_SET) == (off_t) -1) goto file_error;
         rcc = read(fid,buff,reclen);
         if (rcc == -1) goto file_error;
         if (rcc == 0) break;
         if (killThreads) goto cleanup;
      }

      err = close(fid);
      fid = -1;
      if (err) goto file_error;
      
      secs = get_seconds() - time0;
      IOPs[th] = recs / secs;
      MBs[th] = diskMB / secs;

      synch_threads(0);
      if (killThreads) goto cleanup;

      if (th == 0)
      {
         sprintf(label," %11s random read  IO/sec","");
         disk_report(label,Nthreads,IOPs,MBs);
      }

      free(buff);
      buff = 0;
   }
   
   goto cleanup;

file_error:
   snprintf(message1,sizeof(message1)," *** file error: %s \n",strerror(errno)); //  2.9
   wprintz(message1,1);

cleanup:
   if (fid >= 0) close(fid);
   free(buff);
   remove(filename);
   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}


/********************************************************************************/

//  start rpm benchmark

//  Menu entry for the rpm (CPU speed) benchmark.
//  Starts one rpm_thread per configured thread; the threads run until killThreads is set.

void m_rpm()
{
   m_loopy();                                                                    //  3.3  (defined elsewhere; presumably stops a running benchmark - confirm)

   synch_threads(Nthreads);                                                      //  set up thread synchronization for Nthreads

   for (int ii = 0; ii < Nthreads; ii++)
   {
      zadd_locked(NTbusy,+1);                                                    //  each thread decrements NTbusy when done
      start_detached_thread(rpm_thread, &tid[ii]);                               //  thread reads its index from tid[ii]
   }
}


/********************************************************************************/

//  thread for RPM benchmark

//  Thread function for the rpm (CPU speed) benchmark.
//  varg points to this thread's index (int).
//  Thread 0 first prints the heading and calibrates the 100% baseline
//  (m100) from five one-second single-thread measurements.
//  Then every thread repeatedly measures its own speed for 'runtime'
//  seconds and stores it in mcpu[] as a percentage of the baseline;
//  thread 0 prints the values of all threads plus the CPU temperature.
//  Runs until killThreads is set. Always returns 0.

void * rpm_thread(void *varg)
{
   void rpm_mops(int runsecs, double &mops);

   char           message1[100], message2[20];
   static double  m100 = 0, mcpu[10];                                            //  shared: baseline and per-thread results
   double         mops = 0;                                                      //  per-thread measurement (must not be shared)

   int th = * ((int *) varg);

   if (th == 0) 
   {
      wprintz("\n CPU speed relative to 100% \n",1);

      sprintf(message1,"  initial CPU temp: %dC \n",coretemp());
      wprintz(message1,0);

      wprintz("  calibrating with one thread ...\n",0);

      for (int nn = 0; nn < 5; nn++)
      {
         rampCPU();
         rpm_mops(1.0,mops);
         if (mops > m100) m100 = mops;                                           //  baseline 100% speed value
      }
   }
   
   while (! killThreads)
   {
      synch_threads(0);                                                          //  all threads start measuring together
      rpm_mops(runtime,mops);
      if (mops > m100) m100 = mops;                                              //  raise baseline if a thread beats it
      mcpu[th] = 99.9 * mops / m100;
      synch_threads(0);                                                          //  all results are in before printing

      if (th == 0 && ! killThreads)
      {
         sprintf(message1,"  CPU speed: ");
         for (int ii = 0; ii < Nthreads; ii++) {
            sprintf(message2,"%5.1f ",mcpu[ii]);                                 //  CPU speed:  99.9  99.9 ...
            strcat(message1,message2);
         }

         sprintf(message2,"  temp: %dC \n",coretemp());                          //  temp: 55C
         strcat(message1,message2);
         wprintz(message1,0);
      }
   }

   zadd_locked(NTbusy,-1);
   if (! NTbusy) {
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}


//  rpm speed computation

//  rpm speed computation
//  Repeat a fixed block of integer and floating point statements until
//  'runsecs' seconds have elapsed or killThreads is set.
//  Output 'mops' = 20 / (seconds per inner iteration) / 1000000,
//  i.e. each inner iteration is rated as 20 operations.
//  If killThreads is already set on entry, no iteration runs and
//  mops is computed from 0/0 (not a number).

void rpm_mops(int runsecs, double &mops)
{
   int         ii, jj, kk;
   double      loop_time, secs, time0, count, dd;

   secs = count = 0.0;
   time0 = get_seconds();

   while (secs < runsecs && ! killThreads)
   {
      for (ii = 0; ii < 100000; ii++)                                            //  check the clock every 100000 iterations
      {
         jj = kk = 13;                                                           //  workload: results are not used
         dd = 12345.12345;
         jj = jj + kk + 27;
         dd = dd * dd * dd;
         kk = kk + jj + 89;
         dd = dd / 345.678;
         jj = kk - 3 * jj;
         count++;
      }
      secs = get_seconds() - time0;
   }
   
   loop_time = secs / count;                                                     //  seconds per inner iteration
   mops = 20.0 / loop_time / 1000000.0;
   return;
}


/********************************************************************************/

//  start memory test function

//  Menu entry for the memory test.
//  Starts one memtest_thread per configured thread.

void m_memtest()
{
   m_loopy();                                                                    //  3.3  (defined elsewhere; presumably stops a running benchmark - confirm)

   synch_threads(Nthreads);                                                      //  set up thread synchronization for Nthreads

   for (int ii = 0; ii < Nthreads; ii++)
   {
      zadd_locked(NTbusy,+1);                                                    //  count one more busy thread
      start_detached_thread(memtest_thread, &tid[ii]);                           //  thread reads its index from tid[ii]
   }
}


//  memory test thread function
//  Allocate blocks of memory, test intensively, release, repeat.
//
//  varg points to an int holding this thread's number (th).
//  Each pass: allocate 100 blocks of memMB/100 megabytes, fill every block with
//  a pseudo-random value derived from (th, block, pass), then regenerate the
//  same value per block, compare every word, count mismatches, free the block.
//  After each pass thread 0 prints the pass number and the cumulative error
//  count of every thread. The loop ends when killThreads becomes true.
//
//  Error counts are kept in a static array shared by all threads so that
//  thread 0 can report the counts of the other threads. Only the first maxTH
//  threads have a slot and are reported.

void * memtest_thread(void *varg)
{
   enum     { maxTH = 10, Nblocks = 100 };                                       //  reported threads, blocks per pass
   static int errors[maxTH];                                                     //  per-thread error counts, shared
   char     header[100];
   int      th, ii, jj, cc, Nreport;
   uint     blocksize, MB = 1024 * 1024;                                         //  up to 4095 MB   1.2
   void     *memloc[Nblocks];
   int      rnum, *pnum, pass = 0, myerrors = 0;
   uint16   xsubi[3];
   char     message1[200];                                                       //  28 + maxTH * 12 + 2 chars worst case
   
   th = * ((int *) varg);
   if (th == 0) {
      snprintf(header,99,"\n memory test: %d MB, %d threads \n",memMB,Nthreads);
      wprintz(header,1);
   }
   
   if (th >= 0 && th < maxTH) errors[th] = 0;                                    //  reset own slot only (static array
                                                                                 //    survives from a prior test run)
   blocksize = memMB * MB / Nblocks;
   
   for (ii = 0; ii < Nblocks; ii++) memloc[ii] = 0;
   
   while (! killThreads)
   {
      for (ii = 0; ii < Nblocks; ii++)
      {
         zsleep(0.01);                                                           //  avoid 100% cpu hog
         if (killThreads) break;

         memloc[ii] = (char *) zmalloc(blocksize+4);                             //  allocate block

         xsubi[0] = th;                                                          //  seed is fully determined by
         xsubi[1] = ii;                                                          //    thread, block and pass, so the
         xsubi[2] = pass;                                                        //    check loop can regenerate rnum
         rnum = nrand48(xsubi);

         pnum = (int *) memloc[ii];                                              //  fill with random value
         for (jj = 0; jj < (int) blocksize/4; jj++)
            pnum[jj] = rnum;
      }
      
      for (ii = 0; ii < Nblocks; ii++)
      {
         zsleep(0.01);
         if (killThreads) break;                                                 //  blocks left allocated are freed below

         xsubi[0] = th;
         xsubi[1] = ii;
         xsubi[2] = pass;
         rnum = nrand48(xsubi);

         pnum = (int *) memloc[ii];                                              //  check random value
         for (jj = 0; jj < (int) blocksize/4; jj++)
            if (pnum[jj] != rnum) myerrors++;

         zfree(memloc[ii]);                                                      //  release block
         memloc[ii] = 0;
      }

      pass++;
      if (th >= 0 && th < maxTH) errors[th] = myerrors;                          //  publish own count before the threads
      synch_threads(0);                                                          //    meet (presumably a wait for all
      if (killThreads) break;                                                    //    threads - confirm in synch_threads)
      
      if (th == 0)
      {
         Nreport = Nthreads < maxTH ? Nthreads : maxTH;                          //  never read past the shared array
         cc = snprintf(message1,sizeof(message1),"  pass: %d  errors: ",pass);
         for (ii = 0; ii < Nreport; ii++)                                        //  one count per thread, bounded append
            cc += snprintf(message1+cc,sizeof(message1)-cc," %3d",errors[ii]);
         snprintf(message1+cc,sizeof(message1)-cc,"\n");
         wprintz(message1,0);
      }
   }

   for (ii = 0; ii < Nblocks; ii++)                                              //  free blocks left over after a kill
      if (memloc[ii]) zfree(memloc[ii]);

   zadd_locked(NTbusy,-1);
   if (! NTbusy) {                                                               //  last thread to finish reports ready
      wprintz("\n ready\n",0);
      wprintz("\n",0);
   }
   return 0;       
}



