Code snippet from an evening spent experimenting with Parallel.For to speed up image processing on an 8-core PC.

// Ring of the most recent elapsed-tick measurements recorded by GetOpenGLBuffer (100 slots).
static long[] averageTicks = new long[100];
 // Index of the averageTicks slot the next measurement is written to; GetOpenGLBuffer wraps it back to 0.
 static int currentOffset = 0;
 // Output buffer that GetOpenGLBuffer fills and returns; reused across calls instead of being reallocated every time.
 byte[] byteArray;
 // Number of slots in averageTicks that hold a real measurement so far
 // (caps at averageTicks.Length once the ring has been filled once).
 static int averageTicksFilled = 0;

 /// <summary>
 /// Converts the float samples in <c>left</c> to bytes (sample * 255) and writes them into a
 /// reusable byte buffer whose row stride and height come from
 /// <c>Utility.GetNextPowerOfTwo(width)</c> and <c>Utility.GetNextPowerOfTwo(height)</c>.
 /// Also records how long the conversion took and prints the running average to the console.
 /// </summary>
 /// <remarks>
 /// Each pixel occupies 4 bytes in the destination and 4 floats in the source, but only the first
 /// three components are converted; the fourth destination byte is never written by this method.
 /// Author's measurements: a sequential loop took ~17500 ticks, this parallel version ~4400 ticks
 /// (~6500 ticks when all four components are cast).
 /// NOTE(review): the float-to-byte cast does not clamp; this presumably relies on samples being
 /// in [0, 1] - confirm against the producer of <c>left</c>.
 /// </remarks>
 /// <returns>
 /// The internal buffer of <c>powWidth * 4 * powHeight</c> bytes. The same array instance is
 /// returned on subsequent calls as long as the required size does not change.
 /// </returns>
 public byte[] GetOpenGLBuffer()
 {
     int powWidth = Utility.GetNextPowerOfTwo(width);
     int powHeight = Utility.GetNextPowerOfTwo(height);
     int destStride = powWidth * 4;
     int requiredLength = destStride * powHeight;

     // Reallocate not only on first use but also when the required size has changed;
     // otherwise a later change of width/height would index past the end of a stale buffer
     // or hand back an array of the wrong length.
     if (byteArray == null || byteArray.Length != requiredLength)
     {
         byteArray = new byte[requiredLength];
     }

     Stopwatch sw = Stopwatch.StartNew();

     // Hoist member reads into locals so the loop body does not go through 'this' on every access.
     var source = left;
     byte[] dest = byteArray;
     int pixelsPerRow = width;
     int sourceStride = pixelsPerRow * 4;

     // One parallel iteration per image row. Each row writes only to its own
     // [destStride * y, destStride * y + pixelsPerRow * 4) range, so rows do not overlap
     // (assumes powWidth >= width - TODO confirm against Utility.GetNextPowerOfTwo).
     Parallel.For(0, height, y =>
     {
         int offset = destStride * y;
         int floatOffset = sourceStride * y;
         for (int x = 0; x < pixelsPerRow; x++)
         {
             dest[offset] = (byte)(source[floatOffset] * 255.0f);
             dest[offset + 1] = (byte)(source[floatOffset + 1] * 255.0f);
             dest[offset + 2] = (byte)(source[floatOffset + 2] * 255.0f);
             // Fourth component intentionally skipped (see remarks: measured as noticeably slower).
             offset += 4;
             floatOffset += 4;
         }
     });

     // Stop before the bookkeeping below so it is not part of the measurement.
     sw.Stop();

     averageTicks[currentOffset] = sw.ElapsedTicks;
     if (averageTicksFilled < averageTicks.Length)
     {
         ++averageTicksFilled;
     }

     // Average only over slots that contain a measurement; dividing by the full ring size
     // would dilute the first results with zeros from unused slots.
     long total = 0;
     for (int i = 0; i < averageTicksFilled; ++i)
     {
         total += averageTicks[i];
     }
     Console.WriteLine(total / averageTicksFilled);

     // Advance the ring position, wrapping at the end.
     currentOffset = (currentOffset + 1) % averageTicks.Length;

     return byteArray;
 }