import math
import os
import numpy as np
import scipy.io
[docs]def power(independence_test, sample_generator, num_samples=100, num_dimensions=1, noise=0.0, repeats=1000, alpha=.05, simulation_type=''):
'''
Estimate the power of an independence test given a simulator to sample from
:param independence_test: an object whose class inherits from the ``Independence_Test`` abstract class
:type independence_test: ``Object(Independence_Test)``
:param sample_generator: a function used to generate simulation from ``simulations`` with parameters given by the following arguments
- ``num_samples``: default to 100
- ``num_dimensions``: default to 1
- ``noise``: default to 0
:type sample_generator: ``FunctionType`` or ``callable()``
:param num_samples: the number of samples generated by the simulation (default to 100)
:type num_samples: int
:param num_dimensions: the number of dimensions of the samples generated by the simulation (default to 1)
:type num_dimensions: int
:param noise: the noise used in simulation (default to 0)
:type noise: float
:param repeats: the number of times we generate new samples to estimate the null/alternative distribution (default to 1000)
:type repeats: int
:param alpha: the type I error level (default to 0.05)
:type alpha: float
:param simulation_type: specify simulation when necessary (default to empty string)
:type simulation_type: string
:return empirical_power: the estimated power
:rtype: numpy.float
**Example**
>>> from mgcpy.benchmarks.power import power
>>> from mgcpy.independence_tests.mgc.mgc import MGC
>>> from mgcpy.benchmarks.simulations import circle_sim
>>> mgc = MGC()
>>> mgc_power = power(mgc, circle_sim, num_samples=100, num_dimensions=2, simulation_type='ellipse')
'''
# test statistics under the null, used to estimate the cutoff value under the null distribution
test_stats_null = np.zeros(repeats)
# test statistic under the alternative
test_stats_alternative = np.zeros(repeats)
for rep in range(repeats):
# generate new samples for each iteration
# the if-else block below is for simulations that have a different argument list
# than the general case
if simulation_type == 'sine_16pi':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, period=np.pi*16)
elif simulation_type == 'multi_noise' or simulation_type == 'multi_indept':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions)
elif simulation_type == 'ellipse':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, radius=5)
elif simulation_type == 'diamond':
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise, period=-np.pi/8)
else:
matrix_X, matrix_Y = sample_generator(num_samples, num_dimensions, noise=noise)
# permutation test
permuted_y = np.random.permutation(matrix_Y)
test_stats_null[rep], _ = independence_test.test_statistic(matrix_X, permuted_y)
test_stats_alternative[rep], _ = independence_test.test_statistic(matrix_X, matrix_Y)
# if the test is pearson, use absolute value of the test statistic
# so the more extreme test statistic is still in a one-sided interval
if independence_test.get_name() == 'pearson':
test_stats_null[rep] = abs(test_stats_null[rep])
test_stats_alternative[rep] = abs(test_stats_alternative[rep])
# the cutoff is determined so that 1-alpha of the test statistics under the null distribution
# is less than the cutoff
cutoff = np.sort(test_stats_null)[math.ceil(repeats*(1-alpha))]
# the proportion of test statistics under the alternative which is no less than the cutoff (in which case
# the null is rejected) is the empirical power
empirical_power = np.where(test_stats_alternative >= cutoff)[0].shape[0] / repeats
return empirical_power
def power_given_data(independence_test, simulation_type, data_type='dimension', num_samples=100, num_dimensions=1, repeats=1000, alpha=.05, additional_params=None):
    '''
    Estimate the power of an independence test given pre-generated data from the repository ``MGC-paper``.

    Mostly for internal testing purposes.

    :param independence_test: an object whose class inherits from the ``Independence_Test`` abstract class
    :type independence_test: ``Object(Independence_Test)``
    :param simulation_type: specify which simulation is used
    :type simulation_type: int within ``[1, 20]``
    :param data_type: the pre-generated data is either increasing in dimensions or increasing in sample sizes
    :type data_type: string, either 'dimension' or 'sample_size'
    :param num_samples: the number of samples generated by the simulation (default to 100)
    :type num_samples: int
    :param num_dimensions: the number of dimensions of the samples generated by the simulation (default to 1)
    :type num_dimensions: int
    :param repeats: the number of times we generate new samples to estimate the null/alternative distribution (default to 1000)
    :type repeats: int
    :param alpha: the type I error level (default to 0.05)
    :type alpha: float
    :param additional_params: extra keyword arguments forwarded to ``independence_test.test_statistic`` (default to None, treated as no extra arguments)
    :type additional_params: dict or None
    :return empirical_power: the estimated power
    :rtype: numpy.float

    **Example**

    >>> from mgcpy.benchmarks.power import power_given_data
    >>> from mgcpy.independence_tests.mgc.mgc import MGC
    >>> from mgcpy.benchmarks.simulations import circle_sim
    >>> mgc = MGC()
    >>> mgc_power = power_given_data(mgc, simulation_type=4, num_samples=100, num_dimensions=2)
    '''
    # a literal `{}` default would be shared (and mutable) across calls;
    # use None as the sentinel and build a fresh dict per call instead
    if additional_params is None:
        additional_params = {}
    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistic under the alternative
    test_stats_alternative = np.zeros(repeats)
    # absolute path to the benchmark directory
    dir_name = os.path.dirname(__file__)
    # load the relevant pre-generated data
    if data_type == 'dimension':
        file_name_prefix = dir_name + '/sample_data_power_dimensions/type_{}_dim_{}'.format(simulation_type, num_dimensions)
    else:
        file_name_prefix = dir_name + '/sample_data_power_sample_size/type_{}_size_{}'.format(simulation_type, num_samples)
    all_matrix_X = scipy.io.loadmat(file_name_prefix + '_X.mat')['X']
    all_matrix_Y = scipy.io.loadmat(file_name_prefix + '_Y.mat')['Y']
    # if the test is pearson, use absolute value of the test statistic
    # so the more extreme test statistic is still in a one-sided interval;
    # hoisted out of the loop since the name cannot change between iterations
    is_pearson = independence_test.get_name() == 'pearson'
    for rep in range(repeats):
        matrix_X = all_matrix_X[:, :, rep]
        matrix_Y = all_matrix_Y[:, :, rep]
        # permutation test: permuting Y destroys any dependence with X,
        # giving a draw from the null distribution of the statistic
        permuted_y = np.random.permutation(matrix_Y)
        test_stats_null[rep], _ = independence_test.test_statistic(matrix_X, permuted_y, **additional_params)
        test_stats_alternative[rep], _ = independence_test.test_statistic(matrix_X, matrix_Y, **additional_params)
        if is_pearson:
            test_stats_null[rep] = abs(test_stats_null[rep])
            test_stats_alternative[rep] = abs(test_stats_alternative[rep])
    # the cutoff is determined so that 1-alpha of the test statistics under the null
    # distribution is less than the cutoff; the index is clamped to repeats-1 so that
    # alpha values at or near 0 cannot index past the end of the sorted array
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]
    # the proportion of test statistics under the alternative which is no less than the
    # cutoff (in which case the null is rejected) is the empirical power
    empirical_power = np.where(test_stats_alternative >= cutoff)[0].shape[0] / repeats
    return empirical_power