correldata
Read/write vectors of correlated data from/to a csv file.
These data are stored in a dictionary, whose values are numpy arrays with elements which may be strings, floats, or floats with associated uncertainties as defined in the uncertainties library.
1""" 2Read/write vectors of correlated data from/to a csv file. 3 4These data are stored in a dictionary, whose values are numpy arrays 5with elements which may be strings, floats, or floats with associated uncertainties 6as defined in the [uncertainties](https://pypi.org/project/uncertainties) library. 7""" 8 9 10__author__ = 'Mathieu Daëron' 11__contact__ = 'mathieu@daeron.fr' 12__copyright__ = 'Copyright (c) 2024 Mathieu Daëron' 13__license__ = 'MIT License - https://opensource.org/licenses/MIT' 14__date__ = '2024-10-11' 15__version__ = '1.2.0' 16 17 18import os as _os 19import numpy as _np 20import uncertainties as _uc 21 22from typing import Callable, Hashable, Any 23 24class uarray(_np.ndarray): 25 26 __doc__ = """ 27 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 28 of [ufloat](https://pypi.org/project/uncertainties) values 29 """ 30 31 def __new__(cls, a): 32 obj = _np.asarray(a).view(cls) 33 return obj 34 35 n = property(fget = _np.vectorize(lambda x : x.n)) 36 """Return the array of nominal values (read-only).""" 37 38 s = property(fget = _np.vectorize(lambda x : x.s)) 39 """Return the array of standard errors (read-only)""" 40 41 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x))) 42 """Return the correlation matrix of the array elements (read-only)""" 43 44 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x))) 45 """Return the covariance matrix of the array elements (read-only)""" 46 47 nv = n 48 "Alias for `uarray.nv`" 49 50 se = s 51 "Alias for `uarray.s`" 52 53 cor = correl 54 "Alias for `uarray.correl`" 55 56 cov = covar 57 "Alias for `uarray.covar`" 58 59 60def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 61 ''' 62 Test whether 2-D array `M` is symmetric and positive semidefinite. 
63 ''' 64 return _np.all(_np.linalg.eigvals(M) >= 0) and _np.all(M - M.T == 0) 65 66 67def smart_type(x: str): 68 ''' 69 Tries to convert string `x` to a float if it includes a decimal point, or 70 to an integer if it does not. If both attempts fail, return the original 71 string unchanged. 72 ''' 73 try: 74 y = float(x) 75 except ValueError: 76 return x 77 if y % 1 == 0 and '.' not in x: 78 return int(y) 79 return y 80 81 82def read_data(data: str, sep: str = ',', validate_covar: bool = True): 83 ''' 84 Read correlated data from a CSV-like string. 85 86 Column names are interpreted in the following way: 87 * In most cases, each columns is converted to a dict value, with the corresponding 88 dict key being the column's label. 89 * Columns whose label starts with `SE` are interpreted as specifying the standard 90 error for the latest preceding data column. 91 * Columns whose label starts with `correl` are interpreted as specifying the 92 correlation matrix for the latest preceding data column. In that case, column labels 93 are ignored for the rest of the columns belonging to this matrix. 94 * Columns whose label starts with `covar` are interpreted as specifying the 95 covariance matrix for the latest preceding data column. In that case, column labels 96 are ignored for the rest of the columns belonging to this matrix. 97 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 98 the latest preceding data column, by adding an underscore followed by the variable's 99 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 100 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 101 underscore followed by the two variable labels, joined by a second underscore 102 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 103 correspond, respectively, to the lines and columns of this matrix. 
104 * Exceptions will be raised, for any given variable: 105 - when specifying both `covar` and any combination of (`SE`, `correl`) 106 - when specifying `correl` without `SE` 107 108 **Arguments** 109 - `data`: a CSV-like string 110 - `sep`: the CSV separator 111 - `validate_covar`: whether to check that the overall covariance matrix 112 is symmetric and positive semidefinite. Specifying `validate_covar = False` 113 bypasses this computationally expensive step. 114 115 **Example** 116 ```py 117 import correldata 118 data = """ 119 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 120 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 121 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 122 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 123 """[1:-1] 124 print(correldata.read_data(data)) 125 126 # yields: 127 # 128 # > { 129 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 130 # 'Tacid': array([90., 90., 90.]), 131 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 132 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 133 # } 134 ``` 135 ''' 136 137 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 138 N = len(data) - 1 139 140 values, se, correl, covar = {}, {}, {}, {} 141 j = 0 142 while j < len(data[0]): 143 field = data[0][j] 144 if not ( 145 field.startswith('SE_') 146 or field.startswith('correl_') 147 or field.startswith('covar_') 148 or field == 'SE' 149 or field == 'correl' 150 or field == 'covar' 151 or len(field) == 0 152 ): 153 values[field] = _np.array([l[j] for l in data[1:]]) 154 j += 1 155 oldfield = field 156 elif field.startswith('SE_'): 157 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 158 j += 1 159 elif field == 'SE': 160 se[oldfield] = _np.array([l[j] for l in data[1:]]) 161 j += 1 162 elif 
field.startswith('correl_'): 163 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 164 j += N 165 elif field == 'correl': 166 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 167 j += N 168 elif field.startswith('covar_'): 169 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 170 j += N 171 elif field == 'covar': 172 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 173 j += N 174 175 nakedvalues = {} 176 for k in [_ for _ in values]: 177 if ( 178 k not in se 179 and k not in correl 180 and k not in covar 181 ): 182 nakedvalues[k] = values.pop(k) 183 184 for x in values: 185 if x in covar: 186 if x in se: 187 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 188 if x in correl: 189 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 190 if x in correl: 191 if x not in se: 192 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 193 194 for x in correl: 195 if x in values: 196 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 197 else: 198 for x1 in values: 199 for x2 in values: 200 if x == f'{x1}_{x2}': 201 if x1 in se: 202 se1 = se[x1] 203 else: 204 if x1 in covar: 205 se1 = _np.diag(covar[x1])**0.5 206 else: 207 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 208 if x2 in se: 209 se2 = se[x2] 210 else: 211 if x2 in covar: 212 se2 = _np.diag(covar[x2])**0.5 213 else: 214 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 215 216 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 217 218 for x in se: 219 if x in values and x not in correl: 220 covar[x] = _np.diag(se[x]**2) 221 222 for k in [_ for _ in covar]: 223 if k not in values: 224 for j1 in values: 225 for j2 in values: 226 if k == f'{j1}_{j2}': 227 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 228 229 X = _np.array([_ for k in values for _ 
in values[k]]) 230 CM = _np.zeros((X.size, X.size)) 231 for i, vi in enumerate(values): 232 for j, vj in enumerate(values): 233 if vi == vj: 234 if vi in covar: 235 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 236 else: 237 if f'{vi}_{vj}' in covar: 238 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 239 240 if validate_covar and not is_symmetric_positive_semidefinite(CM): 241 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 242 243 corvalues = uarray(_uc.correlated_values(X, CM)) 244 245 allvalues = nakedvalues 246 247 for i, x in enumerate(values): 248 allvalues[x] = corvalues[i*N:i*N+N] 249 250 return allvalues 251 252 253def read_data_from_file(filename: str | _os.PathLike, **kwargs): 254 ''' 255 Read correlated data from a CSV file. 256 257 **Arguments** 258 - `filename`: `str` or path to the file to read from 259 - `kwargs`: passed to correldata.read_data() 260 ''' 261 with open(filename) as fid: 262 return read_data(fid.read(), **kwargs) 263 264 265def f2s( 266 x: Any, 267 f: (str | Callable | dict), 268 k: Hashable = None, 269 fb: (str | Callable) = 'z.6g', 270) -> str: 271 ''' 272 Format `x` according to format `f` 273 274 * If `f` is a string, return `f'{x:{f}}'` 275 * If `f` is a callable, return `f(x)` 276 * If `f` is a dict and optional argument `k` is a hashable, 277 return f2s(x, f[k]), otherwise return f2s(x, fb) 278 ''' 279 280 if isinstance (x, str): 281 return x 282 if isinstance (f, str): 283 return f'{x:{f}}' 284 if isinstance (f, Callable): 285 return f(x) 286 if isinstance (f, dict): 287 if k in f: 288 return f2s(x, f[k]) 289 if isinstance (fb, str): 290 return f'{x:{fb}}' 291 if isinstance (fb, Callable): 292 return fb(x) 293 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.') 294 295 296 297def data_string( 298 data: dict, 299 sep: str = ',', 300 include_fields: list = None, 301 exclude_fields: list = [], 302 float_format: (str | dict | 
Callable) = 'z.6g', 303 correl_format: (str | dict | Callable) = 'z.6f', 304 default_float_format: (str | Callable) = 'z.6g', 305 default_correl_format: (str | Callable) = 'z.6f', 306 align: str = '>', 307 atol: float = 1e-12, 308 rtol: float = 1e-12, 309): 310 ''' 311 Generate CSV-like string from correlated data 312 313 **Arguments** 314 - `data`: dict of arrays with strings, floats or correlated data 315 - `sep`: the CSV separator 316 - `include_fields`: subset of fields to write; if `None`, write all fields 317 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 318 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 319 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 320 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 321 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 322 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 323 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 324 missing from `float_format.keys()` will use `default_float_format` instead. 325 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 
326 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 327 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 328 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 329 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 330 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 331 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 332 333 334 **Example** 335 336 ```py 337 from correldata import _uc 338 from correldata import _np 339 from correldata import * 340 341 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 342 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 343 344 data = dict(X=X, Y=Y, Z=X+Y) 345 346 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 347 348 # yields: 349 # 350 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 351 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 352 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 353 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 354 ``` 355 ''' 356 if include_fields is None: 357 include_fields = [_ for _ in data] 358 cols, ufields = [], [] 359 for f in include_fields: 360 if f in exclude_fields: 361 continue 362 if isinstance(data[f], uarray): 363 ufields.append(f) 364 N = data[f].size 365 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 366 if f'SE_{f}' not in exclude_fields: 367 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 368 if f'correl_{f}' not in exclude_fields: 369 CM = _uc.correlation_matrix(data[f]) 370 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 371 for i in range(N): 372 cols.append( 373 ['' if i else f'correl_{f}'] 374 + [ 375 f2s( 376 
CM[i,j], 377 correl_format, 378 f, 379 default_correl_format, 380 ) 381 for j in range(N) 382 ] 383 ) 384 385 else: 386 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 387 388 for i in range(len(ufields)): 389 for j in range(i): 390 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 391 continue 392 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 393 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 394 for k in range(N): 395 cols.append( 396 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 397 + [ 398 f2s( 399 CM[k,l], 400 correl_format, 401 f, 402 default_correl_format, 403 ) 404 for l in range(N) 405 ] 406 ) 407 408 lines = list(map(list, zip(*cols))) 409 410 if align: 411 lengths = [max([len(e) for e in l]) for l in cols] 412 for l in lines: 413 for k,ln in enumerate(lengths): 414 l[k] = f'{l[k]:{align}{ln}s}' 415 return '\n'.join([(sep+' ').join(l) for l in lines]) 416 417 return '\n'.join([sep.join(l) for l in lines]) 418 419 420 421def save_data_to_file(data, filename, **kwargs): 422 ''' 423 Write correlated data to a CSV file. 424 425 **Arguments** 426 - `data`: dict of arrays with strings, floats or correlated data 427 - `filename`: `str` or path to the file to read from 428 - `kwargs`: passed to correldata.data_string() 429 ''' 430 with open(filename, 'w') as fid: 431 return fid.write(data_string(data, **kwargs))
25class uarray(_np.ndarray): 26 27 __doc__ = """ 28 1-D [ndarray](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html) 29 of [ufloat](https://pypi.org/project/uncertainties) values 30 """ 31 32 def __new__(cls, a): 33 obj = _np.asarray(a).view(cls) 34 return obj 35 36 n = property(fget = _np.vectorize(lambda x : x.n)) 37 """Return the array of nominal values (read-only).""" 38 39 s = property(fget = _np.vectorize(lambda x : x.s)) 40 """Return the array of standard errors (read-only)""" 41 42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x))) 43 """Return the correlation matrix of the array elements (read-only)""" 44 45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x))) 46 """Return the covariance matrix of the array elements (read-only)""" 47 48 nv = n 49 "Alias for `uarray.nv`" 50 51 se = s 52 "Alias for `uarray.s`" 53 54 cor = correl 55 "Alias for `uarray.correl`" 56 57 cov = covar 58 "Alias for `uarray.covar`"
42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
Return the correlation matrix of the array elements (read-only)
45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
Return the covariance matrix of the array elements (read-only)
42 correl = property(fget = lambda x: _np.array(_uc.correlation_matrix(x)))
Alias for uarray.correl
45 covar = property(fget = lambda x: _np.array(_uc.covariance_matrix(x)))
Alias for uarray.covar
Inherited Members
- numpy.ndarray
- dumps
- dump
- all
- any
- argmax
- argmin
- argpartition
- argsort
- astype
- byteswap
- choose
- clip
- compress
- conj
- conjugate
- copy
- cumprod
- cumsum
- diagonal
- dot
- fill
- flatten
- getfield
- item
- max
- mean
- min
- nonzero
- partition
- prod
- put
- ravel
- repeat
- reshape
- resize
- round
- searchsorted
- setfield
- setflags
- sort
- squeeze
- std
- sum
- swapaxes
- take
- tobytes
- tofile
- tolist
- tostring
- trace
- transpose
- var
- view
- to_device
- ndim
- flags
- shape
- strides
- data
- itemsize
- size
- nbytes
- base
- dtype
- real
- imag
- flat
- ctypes
- T
- mT
- ptp
- newbyteorder
- itemset
- device
61def is_symmetric_positive_semidefinite(M: _np.ndarray) -> bool: 62 ''' 63 Test whether 2-D array `M` is symmetric and positive semidefinite. 64 ''' 65 return _np.all(_np.linalg.eigvals(M) >= 0) and _np.all(M - M.T == 0)
Test whether 2-D array M
is symmetric and positive semidefinite.
68def smart_type(x: str): 69 ''' 70 Tries to convert string `x` to a float if it includes a decimal point, or 71 to an integer if it does not. If both attempts fail, return the original 72 string unchanged. 73 ''' 74 try: 75 y = float(x) 76 except ValueError: 77 return x 78 if y % 1 == 0 and '.' not in x: 79 return int(y) 80 return y
Tries to convert string x
to a float if it includes a decimal point, or
to an integer if it does not. If both attempts fail, return the original
string unchanged.
83def read_data(data: str, sep: str = ',', validate_covar: bool = True): 84 ''' 85 Read correlated data from a CSV-like string. 86 87 Column names are interpreted in the following way: 88 * In most cases, each columns is converted to a dict value, with the corresponding 89 dict key being the column's label. 90 * Columns whose label starts with `SE` are interpreted as specifying the standard 91 error for the latest preceding data column. 92 * Columns whose label starts with `correl` are interpreted as specifying the 93 correlation matrix for the latest preceding data column. In that case, column labels 94 are ignored for the rest of the columns belonging to this matrix. 95 * Columns whose label starts with `covar` are interpreted as specifying the 96 covariance matrix for the latest preceding data column. In that case, column labels 97 are ignored for the rest of the columns belonging to this matrix. 98 * `SE`, `correl`, and `covar` may be specified for any arbitrary variable other than 99 the latest preceding data column, by adding an underscore followed by the variable's 100 label (ex: `SE_foo`, `correl_bar`, `covar_baz`). 101 * `correl`, and `covar` may also be specified for any pair of variable, by adding an 102 underscore followed by the two variable labels, joined by a second underscore 103 (ex: `correl_foo_bar`, `covar_X_Y`). The elements of the first and second variables 104 correspond, respectively, to the lines and columns of this matrix. 105 * Exceptions will be raised, for any given variable: 106 - when specifying both `covar` and any combination of (`SE`, `correl`) 107 - when specifying `correl` without `SE` 108 109 **Arguments** 110 - `data`: a CSV-like string 111 - `sep`: the CSV separator 112 - `validate_covar`: whether to check that the overall covariance matrix 113 is symmetric and positive semidefinite. Specifying `validate_covar = False` 114 bypasses this computationally expensive step. 
115 116 **Example** 117 ```py 118 import correldata 119 data = """ 120 Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48 121 FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0 122 BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0 123 BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5 124 """[1:-1] 125 print(correldata.read_data(data)) 126 127 # yields: 128 # 129 # > { 130 # 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'), 131 # 'Tacid': array([90., 90., 90.]), 132 # 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object), 133 # 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object) 134 # } 135 ``` 136 ''' 137 138 data = [[smart_type(e.strip()) for e in l.split(sep)] for l in data.split('\n')] 139 N = len(data) - 1 140 141 values, se, correl, covar = {}, {}, {}, {} 142 j = 0 143 while j < len(data[0]): 144 field = data[0][j] 145 if not ( 146 field.startswith('SE_') 147 or field.startswith('correl_') 148 or field.startswith('covar_') 149 or field == 'SE' 150 or field == 'correl' 151 or field == 'covar' 152 or len(field) == 0 153 ): 154 values[field] = _np.array([l[j] for l in data[1:]]) 155 j += 1 156 oldfield = field 157 elif field.startswith('SE_'): 158 se[field[3:]] = _np.array([l[j] for l in data[1:]]) 159 j += 1 160 elif field == 'SE': 161 se[oldfield] = _np.array([l[j] for l in data[1:]]) 162 j += 1 163 elif field.startswith('correl_'): 164 correl[field[7:]] = _np.array([l[j:j+N] for l in data[1:]]) 165 j += N 166 elif field == 'correl': 167 correl[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 168 j += N 169 elif field.startswith('covar_'): 170 covar[field[6:]] = _np.array([l[j:j+N] for l in data[1:]]) 171 j += N 172 elif field == 'covar': 173 covar[oldfield] = _np.array([l[j:j+N] for l in data[1:]]) 174 j += N 175 176 nakedvalues = {} 177 for k in [_ for _ in values]: 178 
if ( 179 k not in se 180 and k not in correl 181 and k not in covar 182 ): 183 nakedvalues[k] = values.pop(k) 184 185 for x in values: 186 if x in covar: 187 if x in se: 188 raise KeyError(f'Too much information: both SE and covar are specified for variable "{x}".') 189 if x in correl: 190 raise KeyError(f'Too much information: both correl and covar are specified for variable "{x}".') 191 if x in correl: 192 if x not in se: 193 raise KeyError(f'Not enough information: correl is specified without SE for variable "{x}".') 194 195 for x in correl: 196 if x in values: 197 covar[x] = _np.diag(se[x]) @ correl[x] @ _np.diag(se[x]) 198 else: 199 for x1 in values: 200 for x2 in values: 201 if x == f'{x1}_{x2}': 202 if x1 in se: 203 se1 = se[x1] 204 else: 205 if x1 in covar: 206 se1 = _np.diag(covar[x1])**0.5 207 else: 208 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 209 if x2 in se: 210 se2 = se[x2] 211 else: 212 if x2 in covar: 213 se2 = _np.diag(covar[x2])**0.5 214 else: 215 raise KeyError(f'Not enough information: correl_{x} is specified without SE for variable "{x1}".') 216 217 covar[x] = _np.diag(se1) @ correl[x] @ _np.diag(se2) 218 219 for x in se: 220 if x in values and x not in correl: 221 covar[x] = _np.diag(se[x]**2) 222 223 for k in [_ for _ in covar]: 224 if k not in values: 225 for j1 in values: 226 for j2 in values: 227 if k == f'{j1}_{j2}': 228 covar[f'{j2}_{j1}'] = covar[f'{j1}_{j2}'].T 229 230 X = _np.array([_ for k in values for _ in values[k]]) 231 CM = _np.zeros((X.size, X.size)) 232 for i, vi in enumerate(values): 233 for j, vj in enumerate(values): 234 if vi == vj: 235 if vi in covar: 236 CM[N*i:N*i+N,N*j:N*j+N] = covar[vi] 237 else: 238 if f'{vi}_{vj}' in covar: 239 CM[N*i:N*i+N,N*j:N*j+N] = covar[f'{vi}_{vj}'] 240 241 if validate_covar and not is_symmetric_positive_semidefinite(CM): 242 raise _np.linalg.LinAlgError('The complete covariance matrix is not symmetric positive-semidefinite.') 243 244 
corvalues = uarray(_uc.correlated_values(X, CM)) 245 246 allvalues = nakedvalues 247 248 for i, x in enumerate(values): 249 allvalues[x] = corvalues[i*N:i*N+N] 250 251 return allvalues
Read correlated data from a CSV-like string.
Column names are interpreted in the following way:
- In most cases, each columns is converted to a dict value, with the corresponding dict key being the column's label.
- Columns whose label starts with
SE
are interpreted as specifying the standard error for the latest preceding data column. - Columns whose label starts with
correl
are interpreted as specifying the correlation matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix. - Columns whose label starts with
covar
are interpreted as specifying the covariance matrix for the latest preceding data column. In that case, column labels are ignored for the rest of the columns belonging to this matrix. SE
,correl
, andcovar
may be specified for any arbitrary variable other than the latest preceding data column, by adding an underscore followed by the variable's label (ex:SE_foo
,correl_bar
,covar_baz
).correl
, andcovar
may also be specified for any pair of variable, by adding an underscore followed by the two variable labels, joined by a second underscore (ex:correl_foo_bar
,covar_X_Y
). The elements of the first and second variables correspond, respectively, to the lines and columns of this matrix.- Exceptions will be raised, for any given variable:
- when specifying both
covar
and any combination of (SE
,correl
) - when specifying
correl
withoutSE
- when specifying both
**Arguments**
- `data`: a CSV-like string
- `sep`: the CSV separator
- `validate_covar`: whether to check that the overall covariance matrix
  is symmetric and positive semidefinite. Specifying `validate_covar = False`
  bypasses this computationally expensive step.
Example
import correldata
data = """
Sample, Tacid, D47, SE, correl,,, D48, covar,,, correl_D47_D48
FOO, 90., .245, .005, 1, 0.5, 0.5, .145, 4e-4, 1e-4, 1e-4, 0.5, 0, 0
BAR, 90., .246, .005, 0.5, 1, 0.5, .146, 1e-4, 4e-4, 1e-4, 0, 0.5, 0
BAZ, 90., .247, .005, 0.5, 0.5, 1, .147, 1e-4, 1e-4, 4e-4, 0, 0, 0.5
"""[1:-1]
print(read_data(data))
# yields:
#
# > {
# 'Sample': array(['FOO', 'BAR', 'BAZ'], dtype='<U3'),
# 'Tacid': array([90., 90., 90.]),
# 'D47': uarray([0.245+/-0.004999999999999998, 0.246+/-0.004999999999999997, 0.247+/-0.005], dtype=object),
# 'D48': uarray([0.145+/-0.019999999999999993, 0.146+/-0.019999999999999993, 0.147+/-0.019999999999999997], dtype=object)
# }
254def read_data_from_file(filename: str | _os.PathLike, **kwargs): 255 ''' 256 Read correlated data from a CSV file. 257 258 **Arguments** 259 - `filename`: `str` or path to the file to read from 260 - `kwargs`: passed to correldata.read_data() 261 ''' 262 with open(filename) as fid: 263 return read_data(fid.read(), **kwargs)
Read correlated data from a CSV file.
**Arguments**
- `filename`: `str` or path to the file to read from
- `kwargs`: passed to read_data()
266def f2s( 267 x: Any, 268 f: (str | Callable | dict), 269 k: Hashable = None, 270 fb: (str | Callable) = 'z.6g', 271) -> str: 272 ''' 273 Format `x` according to format `f` 274 275 * If `f` is a string, return `f'{x:{f}}'` 276 * If `f` is a callable, return `f(x)` 277 * If `f` is a dict and optional argument `k` is a hashable, 278 return f2s(x, f[k]), otherwise return f2s(x, fb) 279 ''' 280 281 if isinstance (x, str): 282 return x 283 if isinstance (f, str): 284 return f'{x:{f}}' 285 if isinstance (f, Callable): 286 return f(x) 287 if isinstance (f, dict): 288 if k in f: 289 return f2s(x, f[k]) 290 if isinstance (fb, str): 291 return f'{x:{fb}}' 292 if isinstance (fb, Callable): 293 return fb(x) 294 raise TypeError(f'f2s() formatting argument f = {repr(f)} is neither a string nor a dict nor a callable.')
Format x
according to format f
- If
f
is a string, returnf'{x:{f}}'
- If
f
is a callable, returnf(x)
- If
f
is a dict and optional argumentk
is a hashable, return f2s(x, f[k]), otherwise return f2s(x, fb)
298def data_string( 299 data: dict, 300 sep: str = ',', 301 include_fields: list = None, 302 exclude_fields: list = [], 303 float_format: (str | dict | Callable) = 'z.6g', 304 correl_format: (str | dict | Callable) = 'z.6f', 305 default_float_format: (str | Callable) = 'z.6g', 306 default_correl_format: (str | Callable) = 'z.6f', 307 align: str = '>', 308 atol: float = 1e-12, 309 rtol: float = 1e-12, 310): 311 ''' 312 Generate CSV-like string from correlated data 313 314 **Arguments** 315 - `data`: dict of arrays with strings, floats or correlated data 316 - `sep`: the CSV separator 317 - `include_fields`: subset of fields to write; if `None`, write all fields 318 - `exclude_fields`: subset of fields to ignore (takes precedence over `include_fields`); 319 to exclude only the SE for field `foo`, include `SE_foo`; same goes for `correl_foo` 320 - `float_format`: formatting for float values. May be a string (ex: `'z.3f'`), a callable 321 (ex: `lambda x: '.2f' if x else '0'`), or a dictionary of strings and/or callables, with dict keys 322 corresponding to different fields (ex: `{'foo': '.2e', 'bar': (lambda x: str(x))}`). 323 - `correl_format`: same as `float_format`, but applies to correlation matrix elements 324 - `default_float_format`: only used when `float_format` is a dict; in that case, fields 325 missing from `float_format.keys()` will use `default_float_format` instead. 326 corresponding to different fields (ex: `{'foo': '.2e', 'bar': `lambda x: str(x)`}`). 
327 - `default_correl_format`: same as `default_float_format`, but applies to `correl_format` 328 - `align`: right-align (`>`), left-align (`<`), or don't align (empty string) CSV values 329 - `atol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 330 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 331 - `rtol`: passed to [numpy.allclose()](https://numpy.org/doc/stable/reference/generated/numpy.allclose.html) 332 when deciding whether a matrix is equal to the identity matrix or to the zero matrix 333 334 335 **Example** 336 337 ```py 338 from correldata import _uc 339 from correldata import _np 340 from correldata import * 341 342 X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09)) 343 Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16)) 344 345 data = dict(X=X, Y=Y, Z=X+Y) 346 347 print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f')) 348 349 # yields: 350 # 351 # X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, , 352 # 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0 353 # 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0 354 # 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8 355 ``` 356 ''' 357 if include_fields is None: 358 include_fields = [_ for _ in data] 359 cols, ufields = [], [] 360 for f in include_fields: 361 if f in exclude_fields: 362 continue 363 if isinstance(data[f], uarray): 364 ufields.append(f) 365 N = data[f].size 366 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f].n]) 367 if f'SE_{f}' not in exclude_fields: 368 cols.append([f'SE_{f}'] + [f2s(_, float_format, f, default_float_format) for _ in data[f].s]) 369 if f'correl_{f}' not in exclude_fields: 370 CM = _uc.correlation_matrix(data[f]) 371 if not _np.allclose(CM, _np.eye(N), atol = atol, rtol = rtol): 372 for i in range(N): 373 cols.append( 374 ['' if i else f'correl_{f}'] 375 + [ 376 f2s( 377 
CM[i,j], 378 correl_format, 379 f, 380 default_correl_format, 381 ) 382 for j in range(N) 383 ] 384 ) 385 386 else: 387 cols.append([f] + [f2s(_, float_format, f, default_float_format) for _ in data[f]]) 388 389 for i in range(len(ufields)): 390 for j in range(i): 391 if f'correl_{ufields[i]}_{ufields[j]}' in exclude_fields or f'correl_{ufields[j]}_{ufields[i]}' in exclude_fields: 392 continue 393 CM = _uc.correlation_matrix((*data[ufields[i]], *data[ufields[j]]))[:N, -N:] 394 if not _np.allclose(CM, _np.zeros((N, N)), atol = atol, rtol = rtol): 395 for k in range(N): 396 cols.append( 397 ['' if k else f'correl_{ufields[j]}_{ufields[i]}'] 398 + [ 399 f2s( 400 CM[k,l], 401 correl_format, 402 f, 403 default_correl_format, 404 ) 405 for l in range(N) 406 ] 407 ) 408 409 lines = list(map(list, zip(*cols))) 410 411 if align: 412 lengths = [max([len(e) for e in l]) for l in cols] 413 for l in lines: 414 for k,ln in enumerate(lengths): 415 l[k] = f'{l[k]:{align}{ln}s}' 416 return '\n'.join([(sep+' ').join(l) for l in lines]) 417 418 return '\n'.join([sep.join(l) for l in lines])
Generate CSV-like string from correlated data
Arguments
data
: dict of arrays with strings, floats or correlated datasep
: the CSV separatorinclude_fields
: subset of fields to write; ifNone
, write all fieldsexclude_fields
: subset of fields to ignore (takes precedence overinclude_fields
); to exclude only the SE for fieldfoo
, includeSE_foo
; same goes forcorrel_foo
float_format
: formatting for float values. May be a string (ex:'z.3f'
), a callable (ex:lambda x: '.2f' if x else '0'
), or a dictionary of strings and/or callables, with dict keys corresponding to different fields (ex:{'foo': '.2e', 'bar': (lambda x: str(x))}
).correl_format
: same asfloat_format
, but applies to correlation matrix elementsdefault_float_format
: only used whenfloat_format
is a dict; in that case, fields missing fromfloat_format.keys()
will usedefault_float_format
instead. corresponding to different fields (ex:{'foo': '.2e', 'bar':
lambda x: str(x)}
).default_correl_format
: same asdefault_float_format
, but applies tocorrel_format
align
: right-align (>
), left-align (<
), or don't align (empty string) CSV valuesatol
: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrixrtol
: passed to numpy.allclose() when deciding whether a matrix is equal to the identity matrix or to the zero matrix
Example
from correldata import _uc
from correldata import _np
from correldata import *
X = uarray(_uc.correlated_values([1., 2., 3.], _np.eye(3)*0.09))
Y = uarray(_uc.correlated_values([4., 5., 6.], _np.eye(3)*0.16))
data = dict(X=X, Y=Y, Z=X+Y)
print(data_string(data, float_format = 'z.1f', correl_format = 'z.1f'))
# yields:
#
# X, SE_X, Y, SE_Y, Z, SE_Z, correl_X_Z, , , correl_Y_Z, ,
# 1.0, 0.3, 4.0, 0.4, 5.0, 0.5, 0.6, 0.0, 0.0, 0.8, 0.0, 0.0
# 2.0, 0.3, 5.0, 0.4, 7.0, 0.5, 0.0, 0.6, 0.0, 0.0, 0.8, 0.0
# 3.0, 0.3, 6.0, 0.4, 9.0, 0.5, 0.0, 0.0, 0.6, 0.0, 0.0, 0.8
422def save_data_to_file(data, filename, **kwargs): 423 ''' 424 Write correlated data to a CSV file. 425 426 **Arguments** 427 - `data`: dict of arrays with strings, floats or correlated data 428 - `filename`: `str` or path to the file to read from 429 - `kwargs`: passed to correldata.data_string() 430 ''' 431 with open(filename, 'w') as fid: 432 return fid.write(data_string(data, **kwargs))
Write correlated data to a CSV file.
**Arguments**
- `data`: dict of arrays with strings, floats or correlated data
- `filename`: `str` or path to the file to write to
- `kwargs`: passed to data_string()