Source code for paramagpy.fit

import numpy as np
from scipy.optimize import fmin_bfgs
from pprint import pprint
import warnings
from collections import OrderedDict

structdtype = np.dtype([
	('mdl', int          ),
	('atm', object       ),
	('pos', float,  3    ),
	('exp', float        ),
	('cal', float        ),
	('err', float        ),
	('idx', int          ),
	('gam', float        ),
	('xsa', float, (3,3) )])

[docs]def extract_atom_data(data, csa=False, separateModels=True): """ Extract values required for PCS/PRE calculations Parameters ---------- data : numpy.ndarray a numpy structured array containing atomic information and experimental data values This is returned from. :meth:`paramagpy.protein.CustomStructure.parse` csa : bool, optional when True, calculates the CSA tensor for each atom this may be required for RACS and CSAxDSA calculations separateModels : bool, optional when True, separates data into separate lists by their model number. When False, returns only one list Returns ------- arr : numpy.ndarra this has fields specified by structdtype this array is core to all fitting algorithms """ arr = np.empty(len(data), dtype=structdtype) for name in ('mdl', 'atm', 'exp', 'cal'): arr[name] = data[name] _, arr['idx'] = np.unique(data['idx'], return_inverse=True) if 0.0 in data['err']: arr['err'] = np.ones(len(data['err'])) warnings.warn("0.0 value uncertainty. All values weighted evenly") else: arr['err'] = data['err'] arr['pos'] = [a.position for a in data['atm']] arr['gam'] = [a.gamma for a in data['atm']] if csa: arr['xsa'] = [a.csa for a in data['atm']] if separateModels: return [arr[arr['mdl']==m] for m in np.unique(arr['mdl'])] else: return [arr]
[docs]def extract_rdc_data(data, separateModels=True): """ Extract values required for RDC calculations Parameters ---------- data : numpy.ndarray a numpy structured array containing atomic information and experimental data values This is returned from :meth:`paramagpy.protein.CustomStructure.parse` separateModels : bool, optional when True, separates data into separate lists by their model number. When False, returns only one list Returns ------- arr : numpy.ndarra this has fields specified by structdtype this array is core to all fitting algorithms """ arr = np.empty(len(data), dtype=structdtype) for name in ('mdl', 'atm', 'exp', 'cal'): arr[name] = data[name] _, arr['idx'] = np.unique(data['idx'], return_inverse=True) if 0.0 in data['err']: arr['err'] = np.ones(len(data['err'])) warnings.warn("0.0 value uncertainty. All values weighted evenly") else: arr['err'] = data['err'] arr['pos'] = [a2.position - a1.position for a1, a2 in data[['atm','atx']]] arr['gam'] = [a1.gamma * a2.gamma for a1, a2 in data[['atm','atx']]] if separateModels: return [arr[arr['mdl']==m] for m in np.unique(arr['mdl'])] else: return [arr]
[docs]def extract_ccr_data(data, separateModels=True): """ Extract values required for CCR calculations Parameters ---------- data : numpy.ndarray a numpy structured array containing atomic information and experimental data values This is returned from :meth:`paramagpy.protein.CustomStructure.parse` separateModels : bool, optional when True, separates data into separate lists by their model number. When False, returns only one list Returns ------- arr : numpy.ndarra this has fields specified by structdtype this array is core to all fitting algorithms """ arr = np.empty(len(data), dtype=structdtype) for name in ('mdl', 'atm', 'exp', 'cal'): arr[name] = data[name] _, arr['idx'] = np.unique(data['idx'], return_inverse=True) if 0.0 in data['err']: arr['err'] = np.ones(len(data['err'])) warnings.warn("0.0 value uncertainty. All values weighted evenly") else: arr['err'] = data['err'] arr['pos'] = [a.position for a in data['atm']] arr['gam'] = [a.gamma for a in data['atm']] arr['xsa'] = [p.dipole_shift_tensor(a.position) for a,p in data[['atm','atx']]] if separateModels: return [arr[arr['mdl']==m] for m in np.unique(arr['mdl'])] else: return [arr]
[docs]def sphere_grid(origin, radius, points): """ Make a grid of cartesian points within a sphere Parameters ---------- origin : float the centre of the sphere radius : float the radius of the sphere points : int the number of points per radius Returns ------- array : array of [x,y,z] coordinates the points within the sphere """ s = np.linspace(-radius, radius, 2*points-1) mgrid = np.array(np.meshgrid(s, s, s, indexing='ij')).T.reshape(len(s)**3,3) norms = np.linalg.norm(mgrid, axis=1) sphere_idx = np.where(norms<=radius) return mgrid[sphere_idx] + origin
[docs]def svd_calc_metal_from_pcs(pos, pcs, idx, errors): """ Solve PCS equation by single value decomposition. This function is generally called by higher methods like <svd_gridsearch_fit_metal_from_pcs> Parameters ---------- pos : array of [x,y,z] floats the atomic positions in meters pcs : array of floats the PCS values in ppm. Note these should be weighted by the experimental uncertainties. idx : array of ints an index assigned to each atom. Common indices determine summation between models for ensemble averaging. errors : array of floats the standard deviation representing experimental uncertainty in the measured value Returns ------- calc : array of floats the calculated PCS values from the fitted tensor sol : array of floats solution to the linearised PCS equation and consists of the tensor 5 matrix elements """ floatscale = 1E-24 dist = np.linalg.norm(pos, axis=1) x, y, z = pos.T a = x**2 - z**2 b = y**2 - z**2 c = 2 * x * y d = 2 * x * z e = 2 * y * z mat = (1./(4.*np.pi*dist**5)) * np.array([a,b,c,d,e]) / errors mat = np.array([np.bincount(idx, weights=col) for col in mat])*floatscale matinv = np.linalg.pinv(mat) sol = matinv.T.dot(pcs*1E-6) calc = mat.T.dot(sol)*1E6 return calc, sol*floatscale
[docs]def svd_calc_metal_from_pcs_offset(pos, pcs, idx, errors): """ Solve PCS equation by single value decomposition with offset. An offset arising from referencing errors between diamagnetic and paramagnetic datasets can be accounted for using this method. This function is generally called by higher methods like <svd_gridsearch_fit_metal_from_pcs> NOTE: the factor of 1E26 is required for floating point error mitigation Parameters ---------- pos : array of [x,y,z] floats the atomic positions in meters pcs : array of floats the PCS values in ppm. Note these should be weighted by the experimental uncertainties. idx : array of ints an index assigned to each atom. Common indices determine summation between models for ensemble averaging. errors : array of floats the standard deviation representing experimental uncertainty in the measured value Returns ------- calc : array of floats the calculated PCS values from the fitted tensor sol : array of floats solution to the linearised PCS equation and consists of the tensor 5 matrix elements and offset values """ floatscale = 1E-24 dist = np.linalg.norm(pos, axis=1) x, y, z = pos.T a = x**2 - z**2 b = y**2 - z**2 c = 2 * x * y d = 2 * x * z e = 2 * y * z scale = 1./(4.*np.pi*dist**5) mat = scale * np.array([a,b,c,d,e,1E26/scale]) / errors mat = np.array([np.bincount(idx, weights=col) for col in mat])*floatscale matinv = np.linalg.pinv(mat) sol = matinv.T.dot(pcs*1E-6) calc = mat.T.dot(sol)*1E6 sol[-1] *= 1E26 return calc, sol*floatscale
[docs]def svd_gridsearch_fit_metal_from_pcs(initMetals, dataArrays, ensembleAverage=False, origin=None, radius=20.0, points=16, offsetShift=False, progress=None): """ Fit deltaChi tensor to PCS values using Single Value Decomposition over a grid of points in a sphere. Note this uses a weighted SVD fit which takes into account experimental errors Parameters ---------- initMetals : list of Metal objects a list of metals used as starting points for fitting. a list must always be provided, but may also contain only one element. If multiple metals are provided, each metal is fitted to their respective PCS dataset by index in <dataArrays, but all are fitted to a common position. dataArrays : list of PCS dataArray each PCS dataArray must correspond to an associated metal for fitting. each PCS dataArray has structure determined by :meth:`paramagpy.protein.CustomStructure.parse`. ensembleAverage : bool, optional when False, each model of the structure is fit independently. The parameters for each fitted tensor are then averaged before returning the final averaged tensor. When True, the structure models are treated as an ensemble and ensemble averaging of calculated PCS/PRE/RDC/CCR values is conducted at all stages of fitting to fit a single tensor to all models simultaneously. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. origin : float, optional the centre of the gridsearch of positions in Angstroms. If None, the position of the first metal is used radius : float, optional the radius of the gridsearch in Angstroms. points : int, optional the number of points per radius in the gridsearch offsetShift : bool, optional if True, an offset value added to all PCS values is included in the SVD fitting. This may arise due to a referencing error between diamagnetic and paramagnetic PCS datasets and may be used when many data points are available. Default False, no offset is included in the fitting. progress : object, optional to keep track of the calculation, progress.set(x) is called each iteration and varies from 0.0 -> 1.0 when the calculation is complete. Returns ------- fitMetals : list of metals a list of the fitted tensors. dataArrays : list of dataArray each dataArray is copy of the original dataArray with the 'cal' column populated with back-calculated values from the fitted tensor. """ if len(initMetals)!=len(dataArrays): raise ValueError("<metals> and <dataArrays> must have same length") if offsetShift: svd_func = svd_calc_metal_from_pcs_offset else: svd_func = svd_calc_metal_from_pcs datas = {} metalAvgs = [] for metal, dataArray in zip(initMetals, dataArrays): metalAvg = [] if ensembleAverage: d = extract_atom_data(dataArray, csa=False, separateModels=False)[0] m = metal.copy() m.par['weightSum'] = np.bincount(d['idx'], weights=d['exp']/d['err']) metalAvg.append(m) if 0 not in datas: datas[0] = [] datas[0].append((m, d)) else: for d in extract_atom_data(dataArray, csa=False, separateModels=True): mdl = d['mdl'][0] if mdl not in datas: datas[mdl] = [] m = metal.copy() m.par['weightSum'] = np.bincount(d['idx'], weights=d['exp']/d['err']) metalAvg.append(m) datas[mdl].append((m, d)) metalAvgs.append(metalAvg) if origin is None: origin = initMetals[0].position*1E10 sphere = sphere_grid(origin, radius, points)*1E-10 tot = len(sphere)*len(datas) prog = 0.0 if tot<1: raise ValueError("Zero grid points selected for SVD search") print("SVD gridsearch started in {} points".format(len(sphere))) for mdl in datas: minscore = 1E308 data = datas[mdl] for pos in sphere: if progress: prog += 1 progress.set(prog/tot) score = 0.0 sols = [] for m, d in data: calculated, solution = svd_func(d['pos']-pos, m.par['weightSum'], d['idx'], d['err']) sols.append(solution) score += np.sum((calculated - m.par['weightSum'])**2) if score<minscore: minscore = score for sol, (m, _) in zip(sols, data): m.position = pos if offsetShift: m.upper_triang = sol[:-1] m.shift = sol[-1]*1E6 else: m.upper_triang = sol fitMetals = [] for metalAvg in metalAvgs: mAvg = metalAvg[0].copy() mAvg.average(metalAvg) mAvg.set_utr() mAvg.par['mav'] = metalAvg fitMetals.append(mAvg) newDataArrays = [] for m, dataArray in zip(fitMetals, dataArrays): d = extract_atom_data(dataArray, separateModels=False)[0] newDataArray = dataArray.copy() newDataArray['cal'] = m.fast_pcs(d['pos']) newDataArrays.append(newDataArray) if progress: progress.set(1.0) return fitMetals, newDataArrays
[docs]def nlr_fit_metal_from_pcs(initMetals, dataArrays, params=('x','y','z','ax','rh','a','b','g'), ensembleAverage=False, userads=False, useracs=False, progress=None): """ Fit deltaChi tensor to PCS values using non-linear regression. Parameters ---------- initMetals : list of Metal objects a list of metals used as starting points for fitting. a list must always be provided, but may also contain only one element. If multiple metals are provided, each metal is fitted to their respective PCS dataset by index in <dataArrays, but all are fitted to a common position. dataArrays : list of PCS dataArray each PCS dataArray must correspond to an associated metal for fitting. each PCS dataArray has structure determined by :meth:`paramagpy.protein.CustomStructure.parse`. params : list of str the parameters to be fit. For example ['x','y','z','ax','rh','a','b','g','shift'] ensembleAverage : bool, optional when False, each model of the structure is fit independently. The parameters for each fitted tensor are then averaged before returning the final averaged tensor. When True, the structure models are treated as an ensemble and ensemble averaging of calculated PCS/PRE/RDC/CCR values is conducted at all stages of fitting to fit a single tensor to all models simultaneously. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. userads : bool, optional include residual anisotropic dipolar shielding (RADS) during fitting useracs : bool, optional include residual anisotropic chemical shielding (RACS) during fitting. CSA tensors are taken using the <csa> method of atoms. progress : object, optional to keep track of the calculation, progress.set(x) is called each iteration and varies from 0.0 -> 1.0 when the calculation is complete. Returns ------- fitMetals : list of metals a list of the fitted tensors. dataArrays : list of dataArray each dataArray is copy of the original dataArray with the 'cal' column populated with back-calculated values from the fitted tensor. """ if len(initMetals)!=len(dataArrays): raise ValueError("initMetals and dataArrays must have same length") datas = {} metalAvgs = [] for metal, dataArray in zip(initMetals, dataArrays): metalAvg = [] if ensembleAverage: d = extract_atom_data(dataArray, csa=useracs, separateModels=False)[0] m = metal.copy() metalAvg.append(m) if 0 not in datas: datas[0] = [] datas[0].append((m, d)) else: for d in extract_atom_data(dataArray, csa=useracs, separateModels=True): mdl = d['mdl'][0] if mdl not in datas: datas[mdl] = [] m = metal.copy() metalAvg.append(m) datas[mdl].append((m, d)) metalAvgs.append(metalAvg) params = set(params) pospars = tuple(params & set(['x','y','z'])) otherpars = tuple(params - set(['x','y','z'])) for mdl in datas: data = datas[mdl] for i, (m, d) in enumerate(data): m.par['pos'] = slice(0, len(pospars)) m.par['oth'] = slice(len(pospars) + i*len(otherpars), len(pospars) + (i+1)*len(otherpars)) tot = len(datas) prog = 0.0 for mdl in datas: data = datas[mdl] startpars = data[0][0].get_params(pospars) for m, _ in data: startpars += m.get_params(otherpars) def cost(args): score = 0.0 for m, d in data: m.set_params(zip(pospars, args[m.par['pos']])) m.set_params(zip(otherpars, args[m.par['oth']])) cal = m.fast_pcs(d['pos']) if userads: cal += m.fast_rads(d['pos']) if useracs: cal += m.fast_racs(d['xsa']) diff = (cal - d['exp']) / d['err'] selectiveSum = np.bincount(d['idx'], weights=diff) score += np.sum(selectiveSum**2) return score fmin_bfgs(cost, startpars, disp=False) if progress: prog += 1 progress.set(prog/tot) fitMetals = [] for metalAvg in metalAvgs: mAvg = metalAvg[0].copy() mAvg.average(metalAvg) mAvg.set_utr() mAvg.par['mav'] = metalAvg fitMetals.append(mAvg) newDataArrays = [] for m, dataArray in zip(fitMetals, dataArrays): d = extract_atom_data(dataArray, csa=useracs, separateModels=False)[0] newDataArray = dataArray.copy() newDataArray['cal'] = m.fast_pcs(d['pos']) if userads: newDataArray['cal'] += m.fast_rads(d['pos']) if useracs: newDataArray['cal'] += m.fast_racs(d['xsa']) newDataArrays.append(newDataArray) if progress: progress.set(1.0) return fitMetals, newDataArrays
[docs]def nlr_fit_metal_from_pre(initMetals, dataArrays, rtypes, params=('x','y','z'), usesbm=True, usegsbm=False, usedsa=True, usecsa=False, ensembleAverage=False, progress=None): """ Fit Chi tensor to PRE values using non-linear regression. Parameters ---------- initMetals : list of Metal objects a list of metals used as starting points for fitting. a list must always be provided, but may also contain only one element. If multiple metals are provided, each metal is fitted to their respective PRE dataset by index in <dataArrays, but all are fitted to a common position. dataArrays : list of PRE dataArray each PRE dataArray must correspond to an associated metal for fitting. each PRE dataArray has structure determined by :meth:`paramagpy.protein.CustomStructure.parse`. rtypes : list of str, optional the relaxtion type, either 'r1' or 'r2'. A list must be provided with each element corresponding to an associated dataset. Defaults to 'r2' for all datasets of None is specified. params : list of str the parameters to be fit. For example ['x','y','z','ax','rh','a','b','g','shift'] usesbm : bool, optional include Solomon-Bloemenbergen-Morgan (Dipole-dipole) relaxation theory. default is True usegsbm : bool, optional include anisotropic dipolar relaxation theory. note that the g-tensor must be set for this default is False usedsa : bool, optional include Dipolar-Shielding-Anisotropy (Curie Spin) relaxation theory. default is True usecsa : bool, optional include Chemical-Shift-Anisotropy cross-correlated realxation theory. default is False ensembleAverage : bool, optional when False, each model of the structure is fit independently. The parameters for each fitted tensor are then averaged before returning the final averaged tensor. When True, the structure models are treated as an ensemble and ensemble averaging of calculated PCS/PRE/RDC/CCR values is conducted at all stages of fitting to fit a single tensor to all models simultaneously. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. progress : object, optional to keep track of the calculation, progress.set(x) is called each iteration and varies from 0.0 -> 1.0 when the calculation is complete. Returns ------- fitMetals : list of metals a list of the fitted tensors. dataArrays : list of dataArray each dataArray is copy of the original dataArray with the 'cal' column populated with back-calculated values from the fitted tensor. """ if len(initMetals)!=len(dataArrays)!=len(rtypes): raise ValueError("initMetals, dataArrays and rtypes must have same length") if len(set(rtypes) | set(['r1','r2']))>2: raise TypeError("rtype must be a list with values 'r1' and 'r2' only") datas = {} metalAvgs = [] for metal, dataArray, rtype in zip(initMetals, dataArrays, rtypes): metalAvg = [] if ensembleAverage: d = extract_atom_data(dataArray, csa=usecsa, separateModels=False)[0] m = metal.copy() m.par['rtp'] = rtype metalAvg.append(m) if 0 not in datas: datas[0] = [] datas[0].append((m, d)) else: for d in extract_atom_data(dataArray, csa=usecsa, separateModels=True): mdl = d['mdl'][0] if mdl not in datas: datas[mdl] = [] m = metal.copy() m.par['rtp'] = rtype metalAvg.append(m) datas[mdl].append((m, d)) metalAvgs.append(metalAvg) params = set(params) pospars = tuple(params & set(['x','y','z'])) otherpars = tuple(params - set(['x','y','z'])) for mdl in datas: data = datas[mdl] for i, (m, d) in enumerate(data): m.par['pos'] = slice(0, len(pospars)) m.par['oth'] = slice(len(pospars) + i*len(otherpars), len(pospars) + (i+1)*len(otherpars)) for mdl in datas: data = datas[mdl] startpars = data[0][0].get_params(pospars) for m, _ in data: startpars += m.get_params(otherpars) def cost(args): score = 0.0 for m, d in data: m.set_params(zip(pospars, args[m.par['pos']])) m.set_params(zip(otherpars, args[m.par['oth']])) cal = m.fast_pre(d['pos'], d['gam'], m.par['rtp'], dsa=usedsa, sbm=usesbm, gsbm=usegsbm, csaarray=d['xsa']) diff = (cal - d['exp']) / d['err'] selectiveSum = np.bincount(d['idx'], weights=diff) score += np.sum(selectiveSum**2) return score fmin_bfgs(cost, startpars, disp=False) fitMetals = [] for metalAvg in metalAvgs: mAvg = metalAvg[0].copy() mAvg.par['rtp'] = metalAvg[0].par['rtp'] mAvg.par['mav'] = metalAvg mAvg.average(metalAvg) mAvg.set_utr() mAvg.metalAvg = metalAvg fitMetals.append(mAvg) newDataArrays = [] for m, dataArray in zip(fitMetals, dataArrays): d = extract_atom_data(dataArray, csa=usecsa, separateModels=False)[0] newDataArray = dataArray.copy() newDataArray['cal'] = m.fast_pre(d['pos'], d['gam'], m.par['rtp'], dsa=usedsa, sbm=usesbm, gsbm=usegsbm, csaarray=d['xsa']) newDataArrays.append(newDataArray) if progress: progress.set(1.0) return fitMetals, newDataArrays
[docs]def svd_calc_metal_from_rdc(vec, rdc_parameterised, idx, errors): """ Solve RDC equation by single value decomposition. This function is generally called by higher methods like <svd_fit_metal_from_rdc> Parameters ---------- vec : array of [x,y,z] floats the internuclear vectors in meters rdc_parameterised : array of floats the experimental RDC values, normalised by a prefactor idx : array of ints an index assigned to each atom. Common indices determine summation between models for ensemble averaging. errors : array of floats the standard deviation representing experimental uncertainty in the measured value Returns ------- calc : array of floats the calculated RDC values from the fitted tensor sol : array of floats sol is the solution to the linearised PCS equation and consists of the tensor matrix elements """ dist = np.linalg.norm(vec, axis=1) x, y, z = vec.T a = x**2 - z**2 b = y**2 - z**2 c = 2 * x * y d = 2 * x * z e = 2 * y * z mat = (1./dist**5) * np.array([a,b,c,d,e]) / errors matSum = np.array([np.bincount(idx, weights=col) for col in mat]) matinv = np.linalg.pinv(matSum) sol = matinv.T.dot(rdc_parameterised) calc = matSum.T.dot(sol) return calc, sol
[docs]def svd_fit_metal_from_rdc(initMetals, dataArrays, params=('ax','rh','a','b','g'), ensembleAverage=False, progress=None): """ Fit deltaChi tensor to RDC values using Single Value Decomposition. Note this is a weighted SVD calculation which takes into account experimental errors. Parameters ---------- initMetals : list of Metal objects a list of metals used as starting points for fitting. a list must always be provided, but may also contain only one element. If multiple metals are provided, each metal is fitted to their respective RDC dataset by index in <dataArrays>. dataArrays : list of PRE dataArray each RDC dataArray must correspond to an associated metal for fitting. each RDC dataArray has structure determined by :meth:`paramagpy.protein.CustomStructure.parse`. params : list of str the parameters to be fit. NOTE: This is a dummy argument and does not influence the fitting. The default parameters ('ax','rh','a','b','g') are the only option. ensembleAverage : bool, optional when False, each model of the structure is fit independently. The parameters for each fitted tensor are then averaged before returning the final averaged tensor. When True, the structure models are treated as an ensemble and ensemble averaging of calculated PCS/PRE/RDC/CCR values is conducted at all stages of fitting to fit a single tensor to all models simultaneously. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. progress : object, optional to keep track of the calculation, progress.set(x) is called each iteration and varies from 0.0 -> 1.0 when the calculation is complete. Returns ------- fitMetals : list of metals a list of the fitted tensors. dataArrays : list of dataArray each dataArray is copy of the original dataArray with the 'cal' column populated with back-calculated values from the fitted tensor. """ if len(initMetals)!=len(dataArrays): raise ValueError("initMetals and dataArrays must have same length") datas = {} metalAvgs = [] for metal, dataArray in zip(initMetals, dataArrays): metalAvg = [] if ensembleAverage: d = extract_rdc_data(dataArray, separateModels=False)[0] m = metal.copy() metalAvg.append(m) if 0 not in datas: datas[0] = [] datas[0].append((m, d)) else: for d in extract_rdc_data(dataArray, separateModels=True): mdl = d['mdl'][0] if mdl not in datas: datas[mdl] = [] m = metal.copy() metalAvg.append(m) datas[mdl].append((m, d)) metalAvgs.append(metalAvg) tot = len(datas) prog = 0.0 for mdl in datas: data = datas[mdl] for m, d in data: pfarray = -3*(m.MU0 * d['gam'] * m.HBAR) / (8 * np.pi**2) rdc_parameterised = np.bincount(d['idx'], weights=d['exp'] / (pfarray * d['err'])) calculated, solution = svd_calc_metal_from_rdc(d['pos'], rdc_parameterised, d['idx'], d['err']) m.upper_triang_alignment = solution if progress: prog += 1 progress.set(prog/tot) fitMetals = [] for metalAvg in metalAvgs: mAvg = metalAvg[0].copy() mAvg.average(metalAvg) mAvg.set_utr() mAvg.par['mav'] = metalAvg fitMetals.append(mAvg) newDataArrays = [] for m, dataArray in zip(fitMetals, dataArrays): d = extract_rdc_data(dataArray, separateModels=False)[0] newDataArray = dataArray.copy() newDataArray['cal'] = m.fast_rdc(d['pos'], d['gam']) newDataArrays.append(newDataArray) if progress: progress.set(1.0) return fitMetals, newDataArrays
[docs]def nlr_fit_metal_from_ccr(initMetals, dataArrays, params=('x','y','z'), ensembleAverage=False, progress=None): """ Fit Chi tensor to CCR values using non-linear regression. This algorithm applies to CSA/Curie spin cross-correlated relaxation for R2 differential line broadening. Parameters ---------- initMetals : list of Metal objects a list of metals used as starting points for fitting. a list must always be provided, but may also contain only one element. If multiple metals are provided, each metal is fitted to their respective PRE dataset by index in <dataArrays, but all are fitted to a common position. dataArrays : list of PRE dataArray each PRE dataArray must correspond to an associated metal for fitting. each PRE dataArray has structure determined by :meth:`paramagpy.protein.CustomStructure.parse`. params : list of str the parameters to be fit. For example ['x','y','z','ax','rh','a','b','g'] ensembleAverage : bool, optional when False, each model of the structure is fit independently. The parameters for each fitted tensor are then averaged before returning the final averaged tensor. When True, the structure models are treated as an ensemble and ensemble averaging of calculated PCS/PRE/RDC/CCR values is conducted at all stages of fitting to fit a single tensor to all models simultaneously. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. progress : object, optional to keep track of the calculation, progress.set(x) is called each iteration and varies from 0.0 -> 1.0 when the calculation is complete. Returns ------- fitMetals : list of metals a list of the fitted tensors. dataArrays : list of dataArray each dataArray is copy of the original dataArray with the 'cal' column populated with back-calculated values from the fitted tensor. """ if len(initMetals)!=len(dataArrays): raise ValueError("initMetals and dataArrays must have same length") datas = {} metalAvgs = [] for metal, dataArray in zip(initMetals, dataArrays): metalAvg = [] if ensembleAverage: d = extract_ccr_data(dataArray, separateModels=False)[0] m = metal.copy() metalAvg.append(m) if 0 not in datas: datas[0] = [] datas[0].append((m, d)) else: for d in extract_ccr_data(dataArray, separateModels=True): mdl = d['mdl'][0] if mdl not in datas: datas[mdl] = [] m = metal.copy() metalAvg.append(m) datas[mdl].append((m, d)) metalAvgs.append(metalAvg) params = set(params) pospars = tuple(params & set(['x','y','z'])) otherpars = tuple(params - set(['x','y','z'])) for mdl in datas: data = datas[mdl] for i, (m, d) in enumerate(data): m.par['pos'] = slice(0, len(pospars)) m.par['oth'] = slice(len(pospars) + i*len(otherpars), len(pospars) + (i+1)*len(otherpars)) tot = len(datas) prog = 0.0 for mdl in datas: data = datas[mdl] startpars = data[0][0].get_params(pospars) for m, _ in data: startpars += m.get_params(otherpars) def cost(args): score = 0.0 for m, d in data: m.set_params(zip(pospars, args[m.par['pos']])) m.set_params(zip(otherpars, args[m.par['oth']])) cal = m.fast_ccr(d['pos'], d['gam'], d['xsa']) diff = (cal - d['exp']) / d['err'] selectiveSum = np.bincount(d['idx'], weights=diff) score += np.sum(selectiveSum**2) return score fmin_bfgs(cost, startpars, disp=False) if progress: prog += 1 progress.set(prog/tot) fitMetals = [] for metalAvg in metalAvgs: mAvg = metalAvg[0].copy() mAvg.average(metalAvg) mAvg.set_utr() mAvg.par['mav'] = metalAvg fitMetals.append(mAvg) newDataArrays = [] for m, dataArray in zip(fitMetals, dataArrays): d = extract_ccr_data(dataArray, separateModels=False)[0] newDataArray = dataArray.copy() newDataArray['cal'] = m.fast_ccr(d['pos'], d['gam'], d['xsa']) newDataArrays.append(newDataArray) if progress: progress.set(1.0) return fitMetals, newDataArrays
[docs]def metal_standard_deviation(metals, params): """ Calculate the standard deviation in parameters <params> for a list of metal objects <metals>. Parameters ---------- metals : list of Metal objects the metals for which the standard deviation in parameters will be calculated params : list of str the parameters for the standard deviation calculation. For example ['x','y','z','ax','rh','a','b','g','shift'] Returns ------- std_metal : metal object the returned metal object has attributes equal to the standard deviation in the given parameter. All other attributes are zero. """ std_metals = [] all_param_values = [] for metal in metals: all_param_values.append(metal.get_params(params)) std_params = {} for param, values in zip(params, zip(*all_param_values)): std_params[param] = np.std(values) std_metal = metal.__class__(temperature=0.0, B0=0.0) std_metal.set_params(std_params.items()) return std_metal
[docs]def fit_error_models(fittingFunction, **kwargs): """ Perform uncertainty analysis sourcing noise from cooridinates as defined by models of the PDB structure. This function takes a fitting routine <fittingFunction> and repeats it for each model. The standard deviation in the fitted parameters is then returned. Parameters ---------- fittingFunction : function the fitting routine to be used. This could be 'nlr_fit_metal_from_ccr' for example kwargs : dict all key-word arguments will be bundled into this variable and parsed to the fittingFunction. Returns ------- sample_metals : list of list of metals the metals fitted to the data with noise at each iteration std_metals : list of metals the standard deviation in fitted parameters over all iterations of the Monte Carlo simulation. These are stored within the metal object. All unfitted parameters are zero. """ if kwargs.get('ensembleAverage'): raise ValueError("""You cannot define ensemble averaging when sourcing noise from models of the structure as they require separate fits""") fitMetals, _ = fittingFunction(**kwargs) if kwargs.get('progress'): kwargs['progress'].set(1.0) if 'params' not in kwargs: kwargs['params'] = fittingFunction.__defaults__[0] sampleMetals = [m.par['mav'] for m in fitMetals] stdMetals = [] for metals in sampleMetals: stdMetals.append(metal_standard_deviation(metals, kwargs['params'])) return sampleMetals, stdMetals
[docs]def fit_error_monte_carlo(fittingFunction, iterations, **kwargs): """ Perform uncertainty analysis sourcing noise from experimental uncertainties This function takes a fitting routine <fittingFunction> and repeats it for the specified iterations in a Monte-Carlo approach. With each iteration, random noise sourced from a uniform distribution scaled by the experimental uncertainties is added to the experimental values. The standard deviation in the fitted parameters is then returned. NOTE: the 'err' column of the dataArrays must be set to non-zero values for this method to work. Parameters ---------- fittingFunction : function the fitting routine to be used. This could be 'nlr_fit_metal_from_ccr' for example iterations : int the number of iterations for the Monte-Carlo simulation kwargs : dict all key-word arguments will be bundled into this variable and parsed to the fittingFunction. Returns ------- sample_metals : list of list of metals the metals fitted to the data with noise at each iteration std_metals : list of metals the standard deviation in fitted parameters over all iterations of the Monte Carlo simulation. These are stored within the metal object. All unfitted parameters are zero. """ initMetals = kwargs['initMetals'] dataArrays = kwargs['dataArrays'] sampleMetals = [] for i in range(iterations): newInitMetals = [] newDataArrays = [] for m, d in zip(initMetals, dataArrays): newInitMetals.append(m.copy()) d['exp'] += d['err'] * np.random.uniform(low=-1, high=1, size=len(d)) newDataArrays.append(d) kwargs['initMetals'] = newInitMetals kwargs['dataArrays'] = newDataArrays metals, _ = fittingFunction(**kwargs) sampleMetals.append(metals) if kwargs.get('progress'): kwargs['progress'].set(float(i+1)/iterations) sampleMetals = list(zip(*sampleMetals)) stdMetals = [] if 'params' not in kwargs: kwargs['params'] = fittingFunction.__defaults__[0] for metals in sampleMetals: stdMetals.append(metal_standard_deviation(metals, kwargs['params'])) return sampleMetals, stdMetals
[docs]def fit_error_bootstrap(fittingFunction, iterations, fraction, **kwargs): """ Perform uncertainty analysis sourcing noise from fractioning the experimental data. This function takes a fitting routine <fittingFunction> and repeats it for the specified iterations in a Bootstrap approach. With each iteration, a random subset of the experimental data is sampled as specified by the <fraction> argument. The standard deviation in the fitted parameters is then returned. Parameters ---------- fittingFunction : function the fitting routine to be used. This could be 'nlr_fit_metal_from_ccr' for example iterations : int the number of iterations for the Monte-Carlo simulation kwargs : dict all key-word arguments will be bundled into this variable and parsed to the fittingFunction. Returns ------- sample_metals : list of list of metals the metals fitted to the data with noise at each iteration std_metals : list of metals the standard deviation in fitted parameters over all iterations of the Monte Carlo simulation. These are stored within the metal object. All unfitted parameters are zero. """ if not (0.0<fraction<1.0): raise ValueError("The bootstrap sample fraction must be between 0 and 1") initMetals = kwargs['initMetals'] dataArrays = kwargs['dataArrays'] sampleMetals = [] for i in range(iterations): newInitMetals = [] newDataArrays = [] for m, d in zip(initMetals, dataArrays): newInitMetals.append(m.copy()) d = np.random.choice(d, int(fraction*len(d)), replace=False) newDataArrays.append(d) kwargs['initMetals'] = newInitMetals kwargs['dataArrays'] = newDataArrays metals, _ = fittingFunction(**kwargs) sampleMetals.append(metals) if kwargs.get('progress'): kwargs['progress'].set(float(i+1)/iterations) if 'params' not in kwargs: kwargs['params'] = fittingFunction.__defaults__[0] sampleMetals = list(zip(*sampleMetals)) stdMetals = [] for metals in sampleMetals: stdMetals.append(metal_standard_deviation(metals, kwargs['params'])) return sampleMetals, stdMetals
[docs]def qfactor(dataArray, ensembleAverage=False, calDenominator=False): """ Calculate the Q-factor to judge tensor fit quality A lower value indicates a better fit. The Q-factor is calculated using the following equation: .. math:: Q = \\sqrt{ \\frac{\\sum_i\\left[\\left(\\sum_m\\left[ PCS^{exp}_{m,i}-PCS^{calc}_{m,i}\\right]\\right)^2\\right]} {\\sum_i\\left[ \\left(\\sum_m\\left[PCS^{exp}_{m,i}\\right]\\right)^2\\right]} } where :math:`m` and :math:`i` are usually indexed over models and atoms respectively. Parameters ---------- dataArray : numpy array the dataArray must contain the columns 'exp', 'cal' and 'idx' corresponding to the experimenta, calculated and index values respectively. The index value determines the ensemble averaging behaviour, and can be ignored if the argument <ensembleAverage> is False. ensembleAverage : bool, optional when False, the q-factor calculation squares each difference independently. When True, the q-factor calculates an ensemble average before taking the square of differences. The 'idx' column of the dataArray determines the ensemble averaging behaviour with common indices for atoms between models resulting in their summation. calDenominator : bool, optional when False, the standard Q-factor is calculated with only the sum of squares for the experimental values used in the denominator when True, the Q-factor established by Ubbink et al. is calculated which has a sum of absolute values of exp and cal values squared in the denominator. Returns ------- qfactor : float the Q-factor """ if len(dataArray)==0: return np.nan diff = dataArray['exp'] - dataArray['cal'] if ensembleAverage: numer = np.sum(np.bincount(dataArray['idx'], weights=diff)**2) if calDenominator: tmp = np.abs(dataArray['exp']) + np.abs(dataArray['cal']) else: tmp = dataArray['exp'] denom = np.sum(np.bincount(dataArray['idx'], weights=tmp)**2) else: numer = np.sum(diff**2) if calDenominator: tmp = np.abs(dataArray['exp']) + np.abs(dataArray['cal']) else: tmp = dataArray['exp'] denom = np.sum(tmp**2) return (numer/denom)**0.5
[docs]def ensemble_average(dataArray): """ Calculate the ensemble average for the calculated values in the column 'cal' of the argument <dataArray> over models of the PDB file. Ensemble averaging behaviour is determined by the column 'idx' of the input array. Parameters ---------- dataArray : numpy array the input array for ensemble averaging Returns ------- data : numpy array a smaller dataArray with ensemble averaged values """ mdl = sorted(set(dataArray['mdl']))[0] data = dataArray.copy()[dataArray['mdl']==mdl] for row in data: d = dataArray[dataArray['idx']==row['idx']] row['cal'] = np.mean(d['cal']) return data