import math
import os
import numpy as np
import scipy.io
[docs]def power(independence_test, sample_generator, num_samples=100, num_dimensions=1, noise=0.0, repeats=1000, alpha=.05, simulation_type=''):
'''
Estimate the power of an independence test given a simulator to sample from
:param independence_test: an object whose class inherits from the ``Independence_Test`` abstract class
:type independence_test: ``Object(Independence_Test)``
:param sample_generator: a function used to generate simulation from ``simulations`` with parameters given by the following arguments
- ``num_samples``: default to 100
- ``num_dimensions``: default to 1
- ``noise``: default to 0
:type sample_generator: ``FunctionType`` or ``callable()``
:param num_samples: the number of samples generated by the simulation (default to 100)
:type num_samples: int
:param num_dimensions: the number of dimensions of the samples generated by the simulation (default to 1)
:type num_dimensions: int
:param noise: the noise used in simulation (default to 0)
:type noise: float
:param repeats: the number of times we generate new samples to estimate the null/alternative distribution (default to 1000)
:type repeats: int
:param alpha: the type I error level (default to 0.05)
:type alpha: float
:param simulation_type: specify simulation when necessary (default to empty string)
:type simulation_type: string
:return empirical_power: the estimated power
:rtype: numpy.float
**Example**
>>> from mgcpy.benchmarks.power import power
>>> from mgcpy.independence_tests.mgc.mgc import MGC
>>> from mgcpy.benchmarks.simulations import circle_sim
>>> mgc = MGC()
>>> mgc_power = power(mgc, circle_sim, num_samples=100, num_dimensions=2, simulation_type='ellipse')
'''
# test statistics under the null, used to estimate the cutoff value under the null distribution
test_stats_null = np.zeros(repeats)
# test statistic under the alternative
test_stats_alternative = np.zeros(repeats)
for rep in range(repeats):
# generate new samples for each iteration
# the if-else block below is for simulations that have a different argument list
# than the general case
if simulation_type == 'sine_16pi':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, period=np.pi*16)
elif simulation_type == 'multi_noise' or simulation_type == 'multi_indept':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions)
elif simulation_type == 'ellipse':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, radius=5)
elif simulation_type == 'diamond':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, period=-np.pi/8)
else:
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise)
# permutation test
permuted_y = np.random.permutation(matrix_Y)
test_stats_null[rep], _ = independence_test.test_statistic(matrix_X, permuted_y)
test_stats_alternative[rep], _ = independence_test.test_statistic(matrix_X, matrix_Y)
# if the test is pearson, use absolute value of the test statistic
# so the more extreme test statistic is still in a one-sided interval
if independence_test.get_name() == 'pearson':
test_stats_null[rep] = abs(test_stats_null[rep])
test_stats_alternative[rep] = abs(test_stats_alternative[rep])
# the cutoff is determined so that 1-alpha of the test statistics under the null distribution
# is less than the cutoff
cutoff = np.sort(test_stats_null)[math.ceil(repeats*(1-alpha))]
# the proportion of test statistics under the alternative which is no less than the cutoff (in which case
# the null is rejected) is the empirical power
empirical_power = np.where(test_stats_alternative >= cutoff)[0].shape[0] / repeats
return empirical_power
def power_given_data(independence_test, simulation_type, data_type='dimension', num_samples=100, num_dimensions=1, repeats=1000, alpha=.05, additional_params=None):
    '''
    Estimate the power of an independence test given pre-generated data from the repository ``MGC-paper``.

    Mostly for internal testing purposes.

    :param independence_test: an object whose class inherits from the ``Independence_Test`` abstract class
    :type independence_test: ``Object(Independence_Test)``
    :param simulation_type: specify which simulation is used
    :type simulation_type: int within ``[1, 20]``
    :param data_type: the pre-generated data is either increasing in dimensions or increasing in sample sizes
    :type data_type: string, either 'dimension' or 'sample_size'
    :param num_samples: the number of samples generated by the simulation (default to 100)
    :type num_samples: int
    :param num_dimensions: the number of dimensions of the samples generated by the simulation (default to 1)
    :type num_dimensions: int
    :param repeats: the number of times we generate new samples to estimate the null/alternative distribution (default to 1000)
    :type repeats: int
    :param alpha: the type I error level (default to 0.05)
    :type alpha: float
    :param additional_params: extra keyword arguments forwarded to ``independence_test.test_statistic`` (default to None, treated as no extra arguments)
    :type additional_params: dict or None
    :return empirical_power: the estimated power
    :rtype: numpy.float

    **Example**

    >>> from mgcpy.benchmarks.power import power_given_data
    >>> from mgcpy.independence_tests.mgc.mgc import MGC
    >>> from mgcpy.benchmarks.simulations import circle_sim
    >>> mgc = MGC()
    >>> mgc_power = power_given_data(mgc, simulation_type=4, num_samples=100, num_dimensions=2)
    '''
    # a literal `{}` default would be shared (and mutable) across calls;
    # use None as the sentinel and build a fresh dict per call instead
    if additional_params is None:
        additional_params = {}
    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistic under the alternative
    test_stats_alternative = np.zeros(repeats)
    # absolute path to the benchmark directory
    dir_name = os.path.dirname(__file__)
    # load the relevant pre-generated data
    if data_type == 'dimension':
        file_name_prefix = dir_name + '/sample_data_power_dimensions/type_{}_dim_{}'.format(simulation_type, num_dimensions)
    else:
        file_name_prefix = dir_name + '/sample_data_power_sample_size/type_{}_size_{}'.format(simulation_type, num_samples)
    all_matrix_X = scipy.io.loadmat(file_name_prefix + '_X.mat')['X']
    all_matrix_Y = scipy.io.loadmat(file_name_prefix + '_Y.mat')['Y']
    # if the test is pearson, use absolute value of the test statistic
    # so the more extreme test statistic is still in a one-sided interval;
    # hoisted out of the loop since the name cannot change between iterations
    is_pearson = independence_test.get_name() == 'pearson'
    for rep in range(repeats):
        matrix_X = all_matrix_X[:, :, rep]
        matrix_Y = all_matrix_Y[:, :, rep]
        # permutation test: permuting Y destroys any dependence with X,
        # giving a draw from the null distribution of the statistic
        permuted_y = np.random.permutation(matrix_Y)
        test_stats_null[rep], _ = independence_test.test_statistic(matrix_X, permuted_y, **additional_params)
        test_stats_alternative[rep], _ = independence_test.test_statistic(matrix_X, matrix_Y, **additional_params)
        if is_pearson:
            test_stats_null[rep] = abs(test_stats_null[rep])
            test_stats_alternative[rep] = abs(test_stats_alternative[rep])
    # the cutoff is determined so that 1-alpha of the test statistics under the null
    # distribution is less than the cutoff; the index is clamped to repeats-1 so that
    # alpha values at or near 0 cannot index past the end of the sorted array
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]
    # the proportion of test statistics under the alternative which is no less than the
    # cutoff (in which case the null is rejected) is the empirical power
    empirical_power = np.where(test_stats_alternative >= cutoff)[0].shape[0] / repeats
    return empirical_power