Python slicing, rebinning and indexing
25/Nov 2011
During my master thesis I had to manage a lot of (different) data from GIF and GIF2 projects, Millimillennium, Millenium and Millennium 2 simulations and so on. Sometimes there were the need to sort, divide or rearrange these dataset.
Here some examples.
Sort the Millennium 2 data
I developed this script to sort the Millennium 2 data in the $x$ coordinate. The dataset was composed of 512 hdf5 files and I would like to create 1000 files of 100 kpc/h each, taking particles for all the original files.
#!/bin/env python
import time
import numpy as np
import tables as tb
This is the usual modules importing. tables
is the module provided by PyTables to manage, in a great way, hdf5 file under Python.
t_glob = time.time()
Timing is always good!!!
# This create the first "5 Mpc/h" file, every loop operate on 100 kpc/h of data
for i in range(0,50):
t = time.time()
i_dist = i/10.0
j_dist = i_dist + 0.1
print "Starting with limits [Mpc/h]", i_dist, j_dist
temp2 = np.array([])
temp2.shape = (0,3)
# This loop open each of the 512 original files
# selecting the particles inside the limits
# and stacking them into the array that will
# be saved in the new file
for j in range(0,512):
filename = '../../hdf5/data_'+str(j)
print "Open ", filename
h5 = tb.openFile(filename, 'r')
original = h5.root.data.read()
h5.close()
temp = original[original[:,0]>i_dist]
if temp.size < 3:
temp3 = np.array([])
temp3.shape=(0,3)
else:
temp3 = temp[temp[:,0]<=j_dist]
try:
temp2 = np.vstack((temp2,temp3))
except:
print "Error in stacking"
print "temp2 dimensions ", temp2.shape
print "temp3 dimensions ", temp3.shape
exit()
tt=time.time()
# Sorting the array in the x direction
temp2[temp2[:,0].argsort(),]
print "Time for argsort ", time.time()-tt
# Writing data to file
destname = '../../hdf5_sorted/mill2sort-'+str(i)
print "Writing ", destname
dest = tb.openFile(destname, 'w')
dest.createArray(dest.root, 'data', temp2, title='data')
dest.flush()
dest.close()
print "Time for loop ", i, " is ", time.time()-t
print "Done in ",time.time()-t_glob, "seconds"
Rebin the sorted Millennium 2 data
After I have sorted the data, we need to rebin the dataset because it doesn’t fit into our machine memory.
This script make use of the parallel job manager to break the 1000 files of the sorted dataset into 5000 files each containing 20 kpc/h of data, sorted in the $x$ direction.
#! /usr/bin/env python
import sys, os
import numpy as np
import tables as tb
from subprocess import Popen, PIPE
from multiprocessing import Process, Queue
#############################################
sub_slice_dim = 0.02 #Mpc/h
sub_slice_num = 5 # for each of the original slice
n_procs = 5
tot_mill_slice = 999 # number of original slices
############################################
def slicing(i, j, sub_slice_dim, sub_slice_num):
"""Given a mill2 slice it creates 5 subslices.
"""
start_dim = j\*sub_slice_dim
stop_dim = start_dim + sub_slice_dim
print "Loop ", j, " of ", sub_slice_num , " between ", start_dim, " and ", stop_dim, " in slice ", i
pos_greater = mill_slice[start_dim < mill_slice[:,0]]
pos = pos_greater[pos_greater[:,0] <= stop_dim]
h5=tb.openFile('/home/ziosi/mill2_data/hdf5_sorted_rebinned/mill2sort-'+str(i*5+j)+'.h5', 'w')
h5.createArray(h5.root, 'data', pos, title='mill2_sorted_rebinned_pos')
h5.flush()
h5.close()
def breakmill(in_queue):
"""Bring a slice from the queue and start the function to
sub_slice it.
"""
while in_queue.qsize != 0:
ii = in_queue.get()
i = ii[0]
sub_slice_dim = ii[1]
sub_slice_num = ii[2]
original_slice = "/home/ziosi/mill2_data/hdf5_sorted/mill2sort-"+str(i)
mill_slice = tb.openFile(original_slice, 'r')
slice_data = mill_slice.root.data.read()
slice_min = np.amin(slice_data[:,0])
for j in xrange(5):
slicing(i, j, sub_slice, slice_dim, sub_slice_num)
# Create the queues.
in_queue = Queue()
out_queue = Queue()
# Fill the input queue.
try:
for i in xrange(tot_mill_slice):
in_queue.put([i, sub_slice_dim, sub_slice_num])
except:
print "Input queue not filled, exit!"
sys.exit(1)
procs = []
# Create the processes.
try:
for i in range(n_procs):
print "Process creation loop ", i
procs.append(Process(target=breakmill, args=(in_queue, out_queue)))
except:
print "Creating processes not complete, exit..."
sys.exit(1)
# Start the processes.
try:
for i in procs:
i.start()
except:
print "Start processes not complete, exit...";
sys.exit(1)
# Check for processes status.
for i in procs:
print "Process ", i," @ " , i.pid, " is ", status(i)
print "Done."
Indexing data
We also need to index a data file to can retrieve only a slice from the entire file. We did it using the following three elements:
flags = np.arange(0, 110, 0.1)
is the array containing the markers for the distances we want to index. This means that if we want to index distances from 0 to 10 every 1, we get: 0,1,2,3,4,5,6,7,8,9,10, so we can select, for example, the element between 0 and 1 without load the entire list.
start = indexes[np.maximum(np.searchsorted(indexes[:,0], xstart, side='left')-1, 0), 1]
contains the index of the cell in the array that contains the first element we want
stop = indexes[np.minimum(np.searchsorted(indexes[:+1], xstop, side='right')+1, indexes.shape[0]-1), 2]
contains the index of the cell in the array that contains the first element we want.
indexes = np.hstack((flags.reshape((flags.shape[0], 1)), start.reshape((start.shape[0], 1)), end.reshape((end.shape[0], 1))))
is the $(n,3)$ array with the flag, the start and the stop indexes. It is saved into the data file.