Module TIEGCM_Statistics.Manager
This module executes the statistical calculation.
It reads all the TIE-GCM NetCDF files and spawns a worker process for each of them.
Each worker writes temporary binary files containing the values that fall into each bin.
After all processes have finished, the module calculates percentiles for each bin.
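A minimal usage sketch, assuming the package layout implied by the module name; the Data.setDataParams() arguments and all paths are illustrative assumptions:

# hypothetical driver script; import paths may differ in your setup
from TIEGCM_Statistics import Data, Manager

if __name__ == "__main__":   # guard needed because Manager spawns multiprocessing workers
    Data.setDataParams()     # initialize the bin ranges first (actual arguments are defined in the Data module)
    Manager.StartCalculating( NetCDF_files_path="./data/*/*.nc",            # wildcard pattern of TIE-GCM files
                              ResultFilename="./results/JouleHeating.nc",   # final results netCDF file
                              TypeOfCalculation="Ohmic",                    # variable to bin (Joule/Ohmic heating)
                              TmpFilesPath="./results/tmp/" )               # where per-process partial files go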
"""
This module executes the statistical calculation.
It reads all the TIE-GCM NetCDF files and spawns a worker process for each of them.
Each worker writes temporary binary files containing the values that fall into each bin.
After all processes have finished, the module calculates percentiles for each bin.
"""
# local imports
import Data
# system imports
from netCDF4 import Dataset
import os
import datetime
import time
import glob
import shutil
import math
import numpy as np
import multiprocessing
from pathlib import Path
import random
from array import array
NUM_OF_PROCESSORS = 14
TMP_FOLDER = "/media/balukid/STATStmp/"  # default; overridden by the TmpFilesPath argument of StartCalculating()
USE_WEIGHTED_AVERAGE = False
theResultFile_folder = ""      # set by StartCalculating()
theResultFile_simplename = ""  # set by StartCalculating()
def StartCalculating( NetCDF_files_path, ResultFilename, TypeOfCalculation, TmpFilesPath="/media/balukid/STATStmp/" ):
'''
This function manages the statistical calculation:
- For each TIE-GCM netCDF file it spawns a process which stores its results into temporary files in its own folder. Multiple processes are utilised in order to accelerate the calculation by leveraging multiple cores.
- After all processes finish, this function merges all temporary files, calculates percentiles etc. for each bin and creates a results netCDF file.
This function should be called after the Data.setDataParams() function has initialized the bin ranges.
Args:
NetCDF_files_path (string): The path where all the TIE-GCM netCDF files are stored. In wildcard format. Example: "./data/*/*.nc".
ResultFilename (string): The filename where the final calculation results will be stored.
TypeOfCalculation (string): The variable upon which the calculation will be applied. See Data.setDataParams() for more.
TmpFilesPath (string): The path where the temporary files will be stored.
'''
global TMP_FOLDER
global theResultFile_folder
global theResultFile_simplename
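# split ResultFilename into its folder and its base name without the '.nc' extension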
theResultFile_folder = ResultFilename[ 0 : ResultFilename.rfind('/') ]
theResultFile_simplename = ResultFilename[ ResultFilename.rfind('/')+1 : ][0:-3]
if not os.path.exists(theResultFile_folder+"/"+theResultFile_simplename): os.makedirs(theResultFile_folder+"/"+theResultFile_simplename)
startSecs = time.time()
print( "START", datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S") )
TMP_FOLDER = TmpFilesPath
Allprocesses = list()
AllCDFfiles = sorted( glob.glob( NetCDF_files_path, recursive=True ) )
print( "I will calculate '" + TypeOfCalculation + "' on", len(AllCDFfiles), "files:\n ", NetCDF_files_path, "\n" )
print( "Results will be stored in '" + ResultFilename + "'\n" )
n = 0
for CDF_file in AllCDFfiles:
n += 1
Data.Progress = int( 100 * n / len(AllCDFfiles) )  # percentage of files dispatched so far
# spawn new process
P = multiprocessing.Process(target=PROC_StatsCalculator, args=(n,CDF_file,TypeOfCalculation))
Allprocesses.append(P)
P.start()
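# throttle: block here until fewer than NUM_OF_PROCESSORS worker processes are alive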
pause_spawning = True
while pause_spawning:
Num_of_alive_processes = 0
for P in Allprocesses:
if P.is_alive():
Num_of_alive_processes += 1
if Num_of_alive_processes >= NUM_OF_PROCESSORS:
pause_spawning = True
time.sleep(12)
else:
pause_spawning = False
# wait for all processes to terminate
for T in Allprocesses: T.join()
# every process creates a partial file, merge all of them into one
print( "Merging partial data files and calculating result values...", datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S"))
ResultBuckets = Data.init_ResultDataStructure()
NumOfBins = len(Data.KPsequence) * len(Data.ALTsequence) * len(Data.LATsequence) * len(Data.MLTsequence)
CurrBinNum = 0
print( "Data.KPsequence: ", Data.KPsequence)
print( "Data.ALTsequence", Data.ALTsequence)
print( "Data.LATsequence", Data.LATsequence)
print( "Data.MLTsequence", Data.MLTsequence)
for aKP in Data.KPsequence:
for aMLT in Data.MLTsequence:
RegionHits = 0
for anALT in Data.ALTsequence:
for aLat in Data.LATsequence:
CurrBinNum += 1
Data.Progress = int( 100 * CurrBinNum/NumOfBins )
AllBinValues = list()
All_partialData_folders = sorted( glob.glob( TMP_FOLDER+"proc*", recursive=False ) )
for partialDataFolder in All_partialData_folders: # read all partial files for this bin
partialDataFolder = partialDataFolder + "/"
if os.path.isdir(partialDataFolder)==False:
continue
partialDataFilename = partialDataFolder + str(aKP)+"_"+str(anALT)+"_"+str(aLat)+"_"+str(aMLT)+".dat"
if os.path.exists(partialDataFilename) == False: # no hits for this bin from this process
continue
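# each partial file is a flat sequence of float64 values, written with array('d').tofile()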
f = open(partialDataFilename, "rb")
float_array = array('d')
float_array.frombytes(f.read())
AllBinValues += float_array.tolist()
f.close()
print("BIN", "Kp"+str(aKP), "Alt"+str(anALT), "Lat"+str(aLat), "MLT"+str(aMLT), "", len(AllBinValues), "items" )
RegionHits += len(AllBinValues)
if len(AllBinValues) > 0:
ResultBuckets[aKP, anALT, aLat, aMLT, "Sum"] = np.sum(AllBinValues)
ResultBuckets[aKP, anALT, aLat, aMLT, "Len"] = len(AllBinValues)
ResultBuckets[aKP, anALT, aLat, aMLT, "Percentile10"] = np.percentile(AllBinValues, 10)
ResultBuckets[aKP, anALT, aLat, aMLT, "Percentile25"] = np.percentile(AllBinValues, 25)
ResultBuckets[aKP, anALT, aLat, aMLT, "Percentile50"] = np.percentile(AllBinValues, 50)
ResultBuckets[aKP, anALT, aLat, aMLT, "Percentile75"] = np.percentile(AllBinValues, 75)
ResultBuckets[aKP, anALT, aLat, aMLT, "Percentile90"] = np.percentile(AllBinValues, 90)
ResultBuckets[aKP, anALT, aLat, aMLT, "Variance"] = np.var(AllBinValues)
ResultBuckets[aKP, anALT, aLat, aMLT, "Minimum"] = np.nanmin(AllBinValues)
ResultBuckets[aKP, anALT, aLat, aMLT, "Maximum"] = np.nanmax(AllBinValues)
# calculate distribution
if Data.DistributionNumOfSlots > 0:
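# fixed-range histogram over (0, 1e-7); np.histogram ignores values outside this range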
histo_values, histo_ranges = np.histogram(AllBinValues, Data.DistributionNumOfSlots, (0, 0.0000001))
for i in range(0, Data.DistributionNumOfSlots):
ResultBuckets[aKP, anALT, aLat, aMLT, "Distribution"][i] = histo_values[i]
print( "REGION:", "Kp"+str(aKP), "MLT"+str(aMLT), ":", RegionHits, "measurements." )
if "Ohmic" in TypeOfCalculation:
Data.WriteResultsToCDF(ResultBuckets, ResultFilename, "Joule Heating", "W/m3")
elif "SIGMA_PED" in TypeOfCalculation:
Data.WriteResultsToCDF(ResultBuckets, ResultFilename, "Pedersen Conductivity", "S/m")
else:
Data.WriteResultsToCDF(ResultBuckets, ResultFilename, TypeOfCalculation, "")
# DO NOT DELETE PARTIAL FILES - THEY CAN BE USED TO CONTINUE THE CALCULATION AFTER AN INTERMEDIATE HALT
#try: # delete temporary files, which contain all values for each bin
# shutil.rmtree( TMP_FOLDER )
#except:
# pass
finishSecs = time.time()
print( "FINISH", datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S"), " (", finishSecs-startSecs, "sec )")
def PROC_StatsCalculator(ProcessNum, CDF_filename, TypeOfCalculation):
'''
Reads a NetCDF file and separates all the values of the variable into different files according to the bin they fall in.
The variable is chosen by the <TypeOfCalculation> argument.
The process saves several files in its own folder with the name: TMP_FOLDER+"proc"+<ProcessNum>+"/".
The folder contains one binary file for each bin. The file contains all values of the variable which fall inside the bin.
The function also stores, via Store_NumOfMeasurements(), the number of measurements per Kp/MLT sub-region it has processed.
Args:
ProcessNum (int): sequential number of this worker process, used to name its temporary folder.
CDF_filename (string): path of the TIE-GCM netCDF file to process.
TypeOfCalculation (string): the variable upon which the calculation will be applied. See Data.setDataParams() for more.
'''
# check if the data of this process have already been calculated
procfolder = TMP_FOLDER+"proc"+ f"{ProcessNum:03}" +"/"
if os.path.isdir(procfolder):
print( "Proc ", ProcessNum, "already calculated.", flush=True )
return # <<<<
else:
if os.path.isdir(TMP_FOLDER)==False: os.mkdir( TMP_FOLDER )
os.mkdir( procfolder )
# open netCDF file
print("Proc",ProcessNum,"reading ",CDF_filename[CDF_filename.rfind('/')+1:], datetime.datetime.now().strftime("%d-%m-%Y %H:%M:%S"), flush=True)
try:
CDFroot = Dataset( CDF_filename, 'r' )
except:
print ( " !!! WRONG FORMAT:", CDF_filename, flush=True )
#os.remove("ReadingFile.flag") # lower the reading-file flag
return
# read the data from the netCDF file
#TIMEs = CDFroot.variables['time'][:]
if "Ohmic" in TypeOfCalculation or "JH" in TypeOfCalculation: Ohmics = CDFroot.variables['Ohmic'][:, :, :, :] # W/m3
if "SIGMA_PED" in TypeOfCalculation: PEDs = CDFroot.variables['SIGMA_PED'][:, :, :, :]
if "SIGMA_HAL" in TypeOfCalculation: HALs = CDFroot.variables['SIGMA_HAL'][:, :, :, :]
if "EEX_si" in TypeOfCalculation: EEXs = CDFroot.variables['EEX_si'][:, :, :, :]
if "EEY_si" in TypeOfCalculation: EEYs = CDFroot.variables['EEY_si'][:, :, :, :]
if "Convection_heating" in TypeOfCalculation: ConvH = CDFroot.variables['Convection_heating'][:, :, :, :]
if "Wind_heating" in TypeOfCalculation: WindH = CDFroot.variables['Wind_heating'][:, :, :, :]
if "JHminusWindHeat" in TypeOfCalculation: WindH = CDFroot.variables['Wind_heating'][:, :, :, :]
#
LATs = CDFroot.variables['lat'][:]
#MLATs = CDFroot.variables['mlat_qdf'][:, :, :, :]
MLTs = CDFroot.variables['mlt_qdf'][:, :, :, :]
ALTs = CDFroot.variables['ZGMID'][:, :, :, :] / 100000 # Geometric height stored in cm, converted to km
KPs = CDFroot.variables['Kp'][:]
hits = 0 # num of instances that fit in any of the defined bins
ResultBuckets = Data.init_ResultDataStructure().copy()
num_of_unbinned_items = 0
step = 1
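# step controls grid subsampling: 1 visits every grid point, larger values skip points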
for idx_time in range(0, len(ALTs), step):
# $$$$$$$$ for each moment in time put the values in their bins and calculate the mean of each bin.
SingleMomentBuckets = Data.init_ResultDataStructure().copy()
for idx_lev in range(0, len(ALTs[0]), step):
for idx_lat in range(0, len(ALTs[0,0]), step):
for idx_lon in range(0, len(ALTs[0,0,0]), step):
curr_alt_km = ALTs[idx_time, idx_lev, idx_lat, idx_lon]
# ignore values for out-of-range positions
if curr_alt_km<Data.ALT_min or curr_alt_km>Data.ALT_max:
continue
curr_kp = KPs[idx_time]
curr_mlt = MLTs[idx_time, idx_lev, idx_lat, idx_lon]
curr_lat = LATs[idx_lat]
kp_to_fall,alt_to_fall,lat_to_fall,mlt_to_fall = Data.LocatePositionInBuckets(curr_kp,curr_alt_km,curr_lat,curr_mlt)
if kp_to_fall is None or alt_to_fall is None or lat_to_fall is None or mlt_to_fall is None:
num_of_unbinned_items += 1
break # no other longitude can have a hit either
else:
if TypeOfCalculation=="JHminusWindHeat" or TypeOfCalculation=="JHminusWindHeatEISCAT":
aValue = Ohmics[idx_time, idx_lev, idx_lat, idx_lon] - WindH[idx_time, idx_lev, idx_lat, idx_lon]
if aValue > 100: continue # ignore faulty large values
elif TypeOfCalculation=="Ohmic":
aValue = Ohmics[idx_time, idx_lev, idx_lat, idx_lon]
if aValue > 100: continue # ignore faulty large values
elif "SIGMA_PED" in TypeOfCalculation:
aValue = PEDs[idx_time, idx_lev, idx_lat, idx_lon]
elif "SIGMA_HAL" in TypeOfCalculation: # must match the SIGMA_HAL loading condition above
aValue = HALs[idx_time, idx_lev, idx_lat, idx_lon]
elif "EEX_si" in TypeOfCalculation:
aValue = EEXs[idx_time, idx_lev, idx_lat, idx_lon]
elif "EEY_si" in TypeOfCalculation:
aValue = EEYs[idx_time, idx_lev, idx_lat, idx_lon]
elif "Convection_heating" in TypeOfCalculation:
aValue = ConvH[idx_time, idx_lev, idx_lat, idx_lon]
if aValue > 100: continue # ignore faulty large values
elif "Wind_heating" in TypeOfCalculation:
aValue = WindH[idx_time, idx_lev, idx_lat, idx_lon]
else:
print("ERROR: UNRECOGNISED TypeOfCalculation '" + TypeOfCalculation + "'")
CDFroot.close()
return
# bin this value
SingleMomentBuckets[ kp_to_fall, alt_to_fall, lat_to_fall, mlt_to_fall, "Vals" ].append( aValue )
# if weights are enabled then store the value's weight as well
'''
TIEGCM latitudes are: 68.75, 71.25, 73.75, 76.25, 78.75
https://en.wikipedia.org/wiki/Spherical_segment
Area of a sphere segment = 2 * pi * R * height of the segment (the distance from one parallel plane to the other)
R = 6477km
sin(angleFrom) = h_from / R
sin(angleTo) = h_to / R
'''
if USE_WEIGHTED_AVERAGE:
weight = 1
if curr_lat == 68.75:
weight = 0.0410
elif curr_lat == 71.25:
weight = 0.381
elif curr_lat == 73.75:
weight = 0.328
elif curr_lat == 76.25:
weight = 0.249
else:
print( ">>>> unexpected latitude, weight defaults to 1:", curr_lat )
SingleMomentBuckets[ kp_to_fall, alt_to_fall, lat_to_fall, mlt_to_fall, "Weights" ].append( weight )
# keep track of the total number of binned values
hits += 1
# $$$$$$$$ the averages of each time moment are stored in their bin. The percentiles will be calculated on them at the end
if USE_WEIGHTED_AVERAGE: # weighted average
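# weighted mean of this bin for this time step: sum(w_i * v_i) / sum(w_i)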
for aKP in Data.KPsequence:
for anALT in Data.ALTsequence:
for aLat in Data.LATsequence:
for aMLT in Data.MLTsequence:
L = len(SingleMomentBuckets[aKP,anALT,aLat,aMLT,"Vals"])
if L > 0:
S = 0
sum_of_weights = 0
BinVals = SingleMomentBuckets[aKP,anALT,aLat,aMLT,"Vals"]
BinWeights = SingleMomentBuckets[aKP,anALT,aLat,aMLT,"Weights"]
for i in range(len(BinVals)):
S += BinWeights[i] * BinVals[i]
sum_of_weights += BinWeights[i]
ResultBuckets[aKP,anALT,aLat,aMLT,"Vals"].append( S / sum_of_weights )
else: # normal average
for aKP in Data.KPsequence:
for aMLT in Data.MLTsequence:
subfigure_N = 0
for aLat in Data.LATsequence:
for anALT in Data.ALTsequence:
L = len(SingleMomentBuckets[aKP,anALT,aLat,aMLT,"Vals"])
if L > 0:
S = sum(SingleMomentBuckets[aKP,anALT,aLat,aMLT,"Vals"])
ResultBuckets[aKP,anALT,aLat,aMLT,"Vals"].append( S / L )
subfigure_N += L
if subfigure_N > 0: Store_NumOfMeasurements(aKP,aMLT, subfigure_N)
# close cdf
CDFroot.close()
# ---- save results
# save values of each bin in a binary file
for aKP in Data.KPsequence:
for anALT in Data.ALTsequence:
for aLat in Data.LATsequence:
for aMLT in Data.MLTsequence:
if len( ResultBuckets[ aKP, anALT, aLat, aMLT, "Vals" ] ) > 0:
fname = str(aKP) + "_" + str(anALT) + "_" + str(aLat) + "_" + str(aMLT) + ".dat"
f = open( procfolder + fname, "wb" )
float_array = array('d', ResultBuckets[aKP, anALT, aLat, aMLT, "Vals"])
float_array.tofile(f)
f.close()
# -------- print result message
msg = "Proc " + str(ProcessNum) + " " + CDF_filename[-20:] + " Hits=" + str(hits)
for aKP in Data.KPsequence:
msg += "\n"
for aMLT in Data.MLTsequence:
n = 0
for aLat in Data.LATsequence:
for anALT in Data.ALTsequence:
n += len(ResultBuckets[aKP,anALT,aLat,aMLT,"Vals"])
msg += " " + str(n)
msg += " " + datetime.datetime.now().strftime("%H:%M:%S") + "\n"
print(msg, flush=True)
def Store_NumOfMeasurements(aKP, aMLT, curr_num):
'''
Stores the number of measurements that fall inside each sub-region defined by its lower Kp limit and lower MLT limit.
There is one file per sub-region.
This function is necessary because the data are processed by different processes.
Args:
aKP (float): the lower Kp limit of the sub-region.
aMLT (float): the lower MLT limit of the sub-region.
curr_num (int): the number of measurements to add to the stored total.
'''
# LOCK
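# crude inter-process lock: open(..., "x") fails if the lock file already exists, so only one process at a time acquires it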
wait = True
while( wait ):
if os.path.exists( theResultFile_folder + "/" + theResultFile_simplename + "/" + "lock.tmp" ) == False:
try:
f = open( theResultFile_folder + "/" + theResultFile_simplename + "/" + "lock.tmp", "x" )
f.close()
wait = False
except:
wait = True
if wait: time.sleep( random.randint(0,1) )
# WORK
prev_num = 0
fname = str(aKP) + "_" + str(aMLT) + ".txt"
if os.path.exists( theResultFile_folder + "/" + theResultFile_simplename + "/" + fname ):
f = open( theResultFile_folder + "/" + theResultFile_simplename + "/" + fname, "r" )
prev_num = int( f.read() )
f.close()
f = open( theResultFile_folder + "/" + theResultFile_simplename + "/" + fname, "w" )
f.write( str(prev_num + curr_num) )
f.close()
# UNLOCK
try:
os.remove( theResultFile_folder + "/" + theResultFile_simplename + "/" + "lock.tmp" )
except:
pass
Functions
def PROC_StatsCalculator(ProcessNum, CDF_filename, TypeOfCalculation)
-
Reads a NetCDF file and separates all the values of the variable into different files according to the bin they fall in.
The variable is chosen by the TypeOfCalculation argument.
The process saves several files in its own folder with the name: TMP_FOLDER+"proc"+ProcessNum+"/".
The folder contains one binary file for each bin. The file contains all values of the variable which fall inside the bin. The function also stores the number of measurements per Kp/MLT sub-region it has processed.
Args
ProcessNum (int): sequential number of this worker process, used to name its temporary folder.
CDF_filename (string): path of the TIE-GCM netCDF file to process.
TypeOfCalculation (string): the variable upon which the calculation will be applied. See Data.setDataParams() for more.
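A minimal sketch of reading one per-bin partial file back, assuming the float64 layout written by this function (the path and bin labels are illustrative):

from array import array

# hypothetical path: the bin Kp=0, Alt=110, Lat=68.75, MLT=0 written by worker 001
with open("/media/balukid/STATStmp/proc001/0_110_68.75_0.dat", "rb") as f:
    float_array = array('d')          # 'd' = C double, 8 bytes per value
    float_array.frombytes(f.read())   # decode the raw bytes back into floats
print(len(float_array), "values in this bin")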
def StartCalculating(NetCDF_files_path, ResultFilename, TypeOfCalculation, TmpFilesPath='/media/balukid/STATStmp/')
-
This function manages the statistical calculation:
- For each TIE-GCM netCDF file it spawns a process which stores its results into temporary files in its own folder. Multiple processes are utilised in order to accelerate the calculation by leveraging multiple cores.
- After all processes finish, this function merges all temporary files, calculates percentiles etc. for each bin and creates a results netCDF file.
This function should be called after the Data.setDataParams() function has initialized the bin ranges.
Args
NetCDF_files_path
:string
- The path where all the TIE-GCM netCDF files are stored. In wildcard format. Example: "./data/*/*.nc".
ResultFilename
:string
- The filename where the final calculation results will be stored.
TypeOfCalculation
:string
- The variable upon which the calculation will be applied. See Data.setDataParams() for more.
TmpFilesPath
:string
- The path where the temporary files will be stored.
def Store_NumOfMeasurements(aKP, aMLT, curr_num)
-
Stores the number of measurements that fall inside each sub-region defined by its lower Kp limit and lower MLT limit. There is one file per sub-region. This function is necessary because the data are processed by different processes.
Args
aKP (float): the lower Kp limit of the sub-region.
aMLT (float): the lower MLT limit of the sub-region.
curr_num (int): the number of measurements to add to the stored total.
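A minimal sketch of reading the per-region counters back, assuming a result file named ./results/JouleHeating.nc (illustrative path); each counter file holds a single integer named after its lower Kp and MLT limits:

import glob

# hypothetical counters folder created next to the result file by StartCalculating()
for fname in sorted(glob.glob("./results/JouleHeating/*_*.txt")):
    with open(fname) as f:
        print(fname, "->", int(f.read()), "measurements")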