Thursday, May 26, 2011

RVM Thread Pool

package org.jikesrvm.compilers.opt;

import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;


import org.jikesrvm.scheduler.SystemThread;
import org.vmmagic.pragma.NonMoving;

@NonMoving
public class OptCompilerThread
{
    private final int nThreads;
    private final PoolWorker[] threads;
    private final OptCompilerBlockingQueue queue;
    public OptCompilerThread(int nThreads)
    {
        this.nThreads = nThreads;
        queue = new OptCompilerBlockingQueue();
        threads = new PoolWorker[nThreads];

        for (int i=0; i<nThreads; i++) {
            threads[i] = new PoolWorker(i);
            threads[i].start();
        }
    }

  
   

    public void execute(Runnable r) throws InterruptedException {
        synchronized(queue) {
          
           
            queue.add(r);
            queue.notify();
        }
    }
   
   
    public synchronized void shutdown(){
        for(int i=0; i<nThreads; i++){
            threads[i].stop(new IllegalStateException("ThreadPool is stopped"));
           
        }
      }
   

    @NonMoving
    private class PoolWorker extends SystemThread {
    private final ArrayBlockingQueue<Runnable> handoffBox = new ArrayBlockingQueue<Runnable>(1);

    
       
       
        protected PoolWorker(int ThreadCountN) {
            super("CompilerThread" + ThreadCountN);
        }

        @Override
        public void run() {
            Runnable r;
   
            while (true) {
                synchronized(queue) {
                    while (queue.isEmpty()) {
                        try
                        {
                            queue.wait();
                           
                        }
                        catch (InterruptedException ignored)
                        {
                           
                        }
                    }

                    try {
                        r = (Runnable) queue.remove();
                        handoffBox.add(r);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
               
                synchronized(handoffBox)
                {
                    while(handoffBox.isEmpty()){
                        try
                        {
       
                            handoffBox.wait();
                           
                        }
                        catch (InterruptedException ignored)
                        {
                        }
                    }
                    r = (Runnable) handoffBox.remove();
                   
                   
                }

                // If we don't catch RuntimeException,
                // the pool could leak threads
                try {
                    r.run();
                }
                catch (RuntimeException e) {
                    // You might want to log something here
                }
               
             
            
               
       
               
            }
        }
    }
}




/**
 * A small FIFO queue guarded by its own monitor.
 *
 * <p>The queue is either unbounded (no-argument constructor) or bounded by a
 * fixed {@code limit}. {@link #remove()} blocks while the queue is empty and,
 * in bounded mode, {@link #add(Object)} blocks while the queue holds exactly
 * {@code limit} items. Every successful {@code add} or {@code remove}
 * notifies all threads waiting on this queue so that blocked callers
 * re-check their condition.
 */
class OptCompilerBlockingQueue {

      private final List<Object> queue = new LinkedList<Object>();

      /** Capacity in bounded mode; unused when {@code dynamic} is true. */
      private final int limit;

      /** True when the queue is unbounded, in which case add() never blocks. */
      private final boolean dynamic;

      /**
       * Creates a bounded queue.
       *
       * @param limit number of items at which {@link #add(Object)} starts to
       *              block; note that a limit of 0 makes every add block
       */
      public OptCompilerBlockingQueue(int limit) {
        this.limit = limit;
        this.dynamic = false;
      }

      /** Creates an unbounded queue. */
      public OptCompilerBlockingQueue() {
        this.limit = 0;
        this.dynamic = true;
      }

      /**
       * @return true if the queue currently holds no items
       */
      public synchronized boolean isEmpty() {
        return queue.isEmpty();
      }

      /**
       * Appends an item to the tail of the queue, waiting for space first
       * when the queue is bounded and full.
       *
       * @param item the item to append
       * @throws InterruptedException if interrupted while waiting for space
       */
      public synchronized void add(Object item) throws InterruptedException {
        if (!dynamic) {
          while (queue.size() == limit) {
            wait();
          }
        }
        queue.add(item);
        // Wake consumers blocked in remove(); without this they would wait forever.
        notifyAll();
      }

      /**
       * Removes and returns the item at the head of the queue, waiting
       * until one is available.
       *
       * @return the oldest item in the queue
       * @throws InterruptedException if interrupted while waiting for an item
       */
      public synchronized Object remove() throws InterruptedException {
        while (queue.isEmpty()) {
          wait();
        }
        Object head = queue.remove(0);
        // Wake producers blocked in add() on a full bounded queue.
        notifyAll();
        return head;
      }
}
------------------------------------------------------------------------------------------------

package org.jikesrvm.compilers.opt;


import org.jikesrvm.compilers.opt.ir.Register;
import org.vmmagic.pragma.NonMoving;

@NonMoving
public class ParallelOptCompiler implements ParallelOptCompilerPass {
   

   
    public Runnable runPassOnSingleRegister(final Register reg) {
        return new Runnable(){

            public void run() {
                reg.putSSA((reg.defList != null && reg.defList.getNext() == null));
            }
       
        };
       
    }
}


-----------------------------------------------------------------------------------------


package org.jikesrvm.compilers.opt;

import org.jikesrvm.compilers.opt.ir.Register;

/**
 * A pass whose work on a single register can be packaged as a
 * {@link Runnable}.
 */
public interface ParallelOptCompilerPass {
    /**
     * Builds the task for one register.
     *
     * @param reg the register the task operates on
     * @return a task performing this pass's work for {@code reg};
     *         presumably intended to be handed to a worker thread, which is
     *         up to the caller
     */
    Runnable runPassOnSingleRegister(Register reg);
}

---------------------------------------------------------------------------------------


        // Run the per-register pass on a pool of 2 worker threads.
        OptCompilerThread thread = new OptCompilerThread(2);
        ParallelOptCompiler pass = new ParallelOptCompiler();
        // Submit one task per register, walking the register pool from
        // getFirstSymbolicRegister() via getNext().
        for (Register reg = ir.regpool.getFirstSymbolicRegister(); reg != null; reg = reg.getNext()) {
            try {
                thread.execute(pass.runPassOnSingleRegister(reg));
            } catch (InterruptedException e) {
                // Submission was interrupted: report it and carry on with the next register.
                e.printStackTrace();
            }
        }
       
        // NOTE(review): shutdown() is called right after the last submission and
        // nothing here waits for the submitted tasks to complete; confirm that
        // all tasks have run before their results are relied on.
        thread.shutdown();

Monday, May 16, 2011

Non Multithreaded Vanilla Compiler Optimization Times

To establish a baseline for my work, I started by profiling the execution time of each optimization in Simple.java[1].

Optimizations (in Simple.java)
  1. // Compute defList, useList, useCount fields for each register.
        DefUse.computeDU(ir);
  2.    // Recompute isSSA flags
        DefUse.recomputeSSA(ir);
  3. // Simple copy propagation.
        // This pass incrementally updates the register list.
        copyPropagation(ir);
  4.  // Simple type propagation.
        // This pass uses the register list, but doesn't modify it.
       if (typeProp) {
         typePropagation(ir);
        }
  5. // Perform simple bounds-check and arraylength elimination.
        // This pass incrementally updates the register list
        if (foldChecks) {
          arrayPropagation(ir);
        }
  6. // Simple dead code elimination.
        // This pass incrementally updates the register list
        eliminateDeadInstructions(ir);
  7. // constant folding
        // This pass usually doesn't modify the DU, but
        // if it does it will recompute it.
        foldConstants(ir);
        // Simple local expression folding respecting DU
        if (ir.options.LOCAL_EXPRESSION_FOLDING && ExpressionFolding.performLocal(ir)) {
          // constant folding again
          foldConstants(ir);
        }
  8. // Try to remove conditional branches with constant operands
        // If it actually constant folds a branch,
        // this pass will recompute the DU
        if (foldBranches) {
          simplifyConstantBranches(ir);
        }
  9. // Should we sort commutative use operands
        if (sortRegisters) {
          sortCommutativeRegisterUses(ir);
        }

Next we add timer code, plus API calls to obtain the method size and the method name.

Timer:
startTime = System.nanoTime(); 
DefUse.computeDU(ir);
EndTime = System.nanoTime();

Method Name and Method Size:
final int methodinstructionSize = ir.numberInstructions();

 try{ //Debug Execution Time output
        // Create file
        FileWriter fstream = new FileWriter("computeDUmethodExecutionTime.txt",true);
            BufferedWriter out = new BufferedWriter(fstream);
        out.write("\nExecution Time: " + (EndTime - startTime) + " nano Seconds" + " Instructs: " + methodinstructionSize +  " Method: " + ir.getMethod().getName().toString());
        //Close the output stream
        out.close();
        }catch (Exception e){//Catch exception if any
          System.err.println("Error: " + e.getMessage());
        } 

./rvm -classpath dacapo-2006-10-MR2.jar Harness fop

We are left with one ASCII file per optimization, each containing the method size, execution time, and method name for every compiled method:

Execution Time: 21200 nano Seconds Instructs: 37 Method: stringToGlyph
Execution Time: 13520 nano Seconds Instructs: 53 Method: scanDigits
Execution Time: 17480 nano Seconds Instructs: 72 Method: getExplicit
Execution Time: 108801 nano Seconds Instructs: 508 Method: getShorthand


Graphs (Size vs Time)

Array Propagation

computeDU
 
 copyPropagation

 eliminateDeadInstructions

recomputeSSA
 
 sortRegisters

 typePropagation


[1] http://jikesrvm.svn.sourceforge.net/viewvc/jikesrvm/rvmroot/trunk/rvm/src/org/jikesrvm/compilers/opt/Simple.java?revision=16061&view=markup

Saturday, May 7, 2011

Jikes RVM parallel boot image creation execution time results


There are two types of problems: "compiling for a multiprocessor" and "compiling on a multiprocessor"[4]. I am focusing on compiling on a multiprocessor, which can theoretically cut compile time.
More specifically, for the Jikes RVM there are two areas of the compiler that need to be multithreaded to solve the "compiling on a multiprocessor" problem:
the runtime compiler and the boot image writer.[5]
Multithreading of the boot image compilation process has already been done; here are the before and after results on a dual-core processor (two hardware threads).



System INFO
CPU: Intel(R) Core(TM)2 Duo CPU E8300 @ 2.83GHz
Memory: 2GB
OS: Fedora 12 32bit


Here are some before and after test results using Jikes RVM's existing multithreaded boot image compilation[5]:
ant -Dconfig.name=production
BUILD SUCCESSFUL
Total time: 2 minutes 55 seconds
---------------------------------
ant -Dconfig.name=production
BUILD SUCCESSFUL
Total time: 2 minutes 57 seconds
---------------------------------
ant -Dconfig.name=production -Dbootimage.threads=2
BUILD SUCCESSFUL
Total time: 2 minutes 39 seconds
---------------------------------
ant -Dconfig.name=production -Dbootimage.threads=2
BUILD SUCCESSFUL
Total time: 2 minutes 38 seconds

As you can see, with one thread we have a compile time of 2 minutes and 55 seconds, and with two threads we have a compile time of 2 minutes and 39 seconds.
My research goal is to cut down the execution time of the Jikes RVM compilation.

Here is how it works in the code:

First we start with our command line option of:
ant -Dconfig.name=production -Dbootimage.threads=2
These command line options are read with the help of CommandLineArgs.java[1] and possibly a few other source files.
The option overrides the default <property name="bootimage.threads" value="1"/> in the build.xml file[2], which enables multithreaded (parallel) boot image compilation.
Delving further into the source code, this option sets a value in BootImageWriter.java[3], allowing for multithreaded compilation of the boot image.

CODE:

The flag -Dbootimage.threads=2 sets -numThreads=2 in BootImageWriter.java[3]

Command: ant -v -Dconfig.name=production -Dbootimage.threads=2

Output:

[java] '-X:bc:O2'
[java] '-littleEndian'
[java] '-da'
[java] '0x60000000'
[java] '-ca'
[java] '0x64000000'
[java] '-ra'
[java] '0x67000000'
[java] '-numThreads=2'
[java] '-classlib'
[java] 'Classpath'
[java]
[java] The ' characters around the executable and arguments are
[java] not part of the command.
[java] BootImageWriter: compiler arg: O2

Note: See the correlation between -Dbootimage.threads=N and -numThreads=N
-Dbootimage.threads=N sets the value of -numThreads=N




The code below is contained in the source file BootImageWriter.java[3]
 
* -numThreads=N number of parallel compilation threads we should create


if (args[i].startsWith("-numThreads=")) {
numThreads = Integer.parseInt(args[i].substring(12));
if (numThreads < 1) {
fail("numThreads must be a positive number, value supplied: "+ numThreads);
}
continue;
}




if (verbose >= 1) say(" compiling with " + numThreads + " threads");
ExecutorService threadPool = Executors.newFixedThreadPool(numThreads);
for (RVMType type: bootImageTypes.values()) {
threadPool.execute(new BootImageWorker(type));
}
threadPool.shutdown();
try {
while(!threadPool.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS)) {
say("Compilation really shouldn't take this long");
}
} catch (InterruptedException e){
throw new Error("Build interrupted", e);
}
if (BootImageWorker.instantiationFailed) {
throw new Error("Error during instantiaion");
}


Full System Information:

command: cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Core(TM)2 Duo CPU E8300 @ 2.83GHz
stepping : 6
cpu MHz : 2124.000
cache size : 6144 KB
physical id : 0
siblings : 2
core id : 0
cpu cores : 2
apicid : 0
initial apicid : 0
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm tpr_shadow vnmi flexpriority
bogomips : 5666.99
clflush size : 64
power management:
processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Core(TM)2 Duo CPU E8300 @ 2.83GHz
stepping : 6
cpu MHz : 2124.000
cache size : 6144 KB
physical id : 0
siblings : 2
core id : 1
cpu cores : 2
apicid : 1
initial apicid : 1
fdiv_bug : no
hlt_bug : no
f00f_bug : no
coma_bug : no
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe nx lm constant_tsc arch_perfmon pebs bts pni dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm tpr_shadow vnmi flexpriority
bogomips : 5666.33
clflush size : 64
power management:

command: cat /proc/meminfo
MemTotal: 2060128 kB
MemFree: 357516 kB
Buffers: 60876 kB
Cached: 501892 kB
SwapCached: 4344 kB
Active: 841384 kB
Inactive: 620044 kB
Active(anon): 478480 kB
Inactive(anon): 433076 kB
Active(file): 362904 kB
Inactive(file): 186968 kB
Unevictable: 0 kB
Mlocked: 0 kB
HighTotal: 1188744 kB
HighFree: 59396 kB
LowTotal: 871384 kB
LowFree: 298120 kB
SwapTotal: 4128760 kB
SwapFree: 4064148 kB
Dirty: 128 kB
Writeback: 0 kB
AnonPages: 894768 kB
Mapped: 55872 kB
Slab: 114072 kB
SReclaimable: 83344 kB
SUnreclaim: 30728 kB
PageTables: 9800 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 5158824 kB
Committed_AS: 2058564 kB
VmallocTotal: 122880 kB
VmallocUsed: 78384 kB
VmallocChunk: 28260 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
DirectMap4k: 8184 kB
DirectMap2M: 899072 kB

Command: uname -a
Linux localhost.localdomain 2.6.31.5-127.fc12.i686.PAE #1 SMP Sat Nov 7 21:25:57 EST 2009 i686 i686 i386 GNU/Linux 


[5]ftp://ftp.cs.man.ac.uk/pub/apt/theses/ChristosKotselidis_MSc.pdf%20