Source code for process_jsonfile

"""
An example of how to extract information from a BASTA .json file.

IMPORTANT: Please run an example producing a json-file to run this tutorial!!
"""
import os
import sys

import h5py
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from scipy.ndimage import gaussian_filter

import basta.fileio as fio
from basta.downloader import get_basta_dir

# Definition of the path to BASTA, just in case you need it
BASTADIR = get_basta_dir()



[docs]
def extract_from_json(jsonfile, gridfile, parameter):
    """
    Extract information from a BASTA json file.

    During a fit, if "optionaloutputs" is True, a dump of the statistics for each star
    will be stored in a .json file. The grid used to perform the fit is required to
    obtain any useful information from the json-dump.

    Parameters
    ----------
    jsonfile : str
        Path to the .json file from the BASTA fit

    gridfile : str
        Path to the grid used to compute the fit

    parameter : str
        The parameter values to extract (must be a valid name!)

    Returns
    -------
    parametervals : array_like
        Values of the given parameter for all models (with non-zero likelihood) in the
        grid

    loglikes : array_like
        The log(likelihood) of all models in the grid, excluding zero-likelihood models

    chi2 : array_like
        The chi**2 values of all models (with non-zero likelihood) in the grid
    """
    # The json file can be read into selectedmodels (in BASTA lingo), which contains
    # chi2 and log-likelihood (called logPDF internally) of all models in the grid. It
    # is stored by track (see below).
    selectedmodels = fio.load_selectedmodels(jsonfile)

    # Typically, we want to extract the likelihoods and the values of a given parameter
    # for all models in the grid/fit
    with h5py.File(gridfile, "r") as grid:
        parametervals = []
        loglikes = []
        chi2s = []

        # In selectedmodels, the information is accessed by looping over the tracks
        for trackpath, trackstats in sorted(selectedmodels.items()):
            # The log-likelihood (called logPDF) and chi2 can be stored directly
            loglikes.append(trackstats.logPDF)
            chi2s.append(trackstats.chi2)

            # The corresponding parameter values can be extracted from the grid
            gridval = grid["{0}/{1}".format(trackpath, parameter)]
            parametervals.append(gridval[trackstats.index])

        # After extraction it is useful to collapse the lists into numpy arrays
        parametervals = np.concatenate(parametervals)
        loglikes = np.concatenate(loglikes)
        chi2s = np.concatenate(chi2s)

    return parametervals, loglikes, chi2s




[docs]
def sample_posterior(vals, logys):
    """
    Computation of the posterior of a parameter by numerical sampling. Duplication of
    how it is performed in basta/process_output.py.

    Parameters
    ----------
    vals : array_like
        Parameter values

    logys : array_like
        Corresponding log(likelihood)'s

    Returns
    ------
    samples : array_like
        Samples of the posterior. Can be used to make the posterior distribution.
    """
    lk = logys - np.amax(logys)
    post = np.exp(lk - np.log(np.sum(np.exp(lk))))
    nsamples = min(100000, len(logys))
    sampled_indices = np.random.choice(a=np.arange(len(post)), p=post, size=nsamples)
    return vals[sampled_indices]




[docs]
def main():
    """
    An example of how to extract information from a BASTA .json file. Will be plotted
    as a simple histogram and as a KDE-representation (as BASTA uses for the corner
    plots). Uses the auxiliary functions above.

    IMPORTANT: Please run the example in examples/xmlinput/{create_inputfile_json.py,
               input_json.xml} or the template examples/{create_inputfile.py,
               input_myfit.xml} produce the input files required by this module!

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    # Init
    plt.close("all")
    outdir = os.path.join("output", "json-analysis")
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # Extract a given parameter and corresponding statistics
    param = "massini"
    infile = os.path.join("output", "json", "16CygA.json")
    gridf = os.path.join(BASTADIR, "grids", "Garstec_16CygA.hdf5")
    print(f"Using statistics from '{infile}' with grid from '{gridf}'!\n")
    print("Loading stats and grid ... ", end="", flush=True)
    try:
        paramvals, loglike, _ = extract_from_json(
            jsonfile=infile, gridfile=gridf, parameter=param
        )
    except FileNotFoundError:
        print(f"\nCannot read '{infile}'! Did you run 'xmlinput/input_json.xml'?")
        infile = os.path.join("output", "myfit", "16CygA.json")
        print(f"Trying '{infile}' instead ... ", end="", flush=True)
        try:
            paramvals, loglike, _ = extract_from_json(
                jsonfile=infile, gridfile=gridf, parameter=param
            )
        except FileNotFoundError:
            print("\nCannot read that either! Aborting!!")
            sys.exit(1)
    print("Done!", flush=True)

    # The extracted information can then be used to, e.g., sample the posterior
    print("Sampling the posterior ... ", end="", flush=True)
    samples = sample_posterior(vals=paramvals, logys=loglike)
    print("Done!", flush=True)

    # ... which can then be plotted in a simple, smoothed histogram
    print("Creating posterior histogram ... ", end="", flush=True)
    _, ax = plt.subplots()
    counts, bins = np.histogram(a=samples, bins=100)  # Fixed bins
    counts = gaussian_filter(input=counts, sigma=2)  # Slightly smoothed
    x0 = np.array(list(zip(bins[:-1], bins[1:]))).flatten()
    y0 = np.array(list(zip(counts, counts))).flatten()
    ax.plot(x0, y0, label="Smoothed posterior histogram (100 bins)")
    ax.set_xlabel(param)
    ax.set_ylabel("Counts")
    ax.legend(loc="best", facecolor="none", edgecolor="none", fontsize="small")
    plt.savefig(
        os.path.join(outdir, f"posterior_{param}_simple.pdf"), bbox_inches="tight"
    )
    print("Done!", flush=True)

    # ... or in a KDE smoothed version, like the BASTA cornerplots
    print("Creating posterior KDE ... ", end="", flush=True)
    histargs = {"color": "tab:cyan", "alpha": 0.17}
    histargs_line = {"color": "tab:orange", "alpha": 0.7}
    histargs_fill = {"color": "tab:orange", "alpha": 0.1}
    _, ax = plt.subplots()
    counts, bins = np.histogram(a=samples, bins="auto")
    counts = gaussian_filter(input=counts, sigma=1)
    kernel = gaussian_kde(samples, bw_method="silverman")
    x0 = np.linspace(np.amin(samples), np.amax(samples), num=500)
    y0 = kernel(x0)
    y0 /= np.amax(y0)
    x0_hist = np.array(list(zip(bins[:-1], bins[1:]))).flatten()
    y0_hist = np.array(list(zip(counts, counts))).flatten() / np.amax(counts)
    ax.fill_between(
        x0_hist, y0_hist, interpolate=True, label="Posterior histogram", **histargs
    )
    ax.plot(x0, y0, label="Posterior KDE", **histargs_line)
    ax.fill_between(x0, y0, interpolate=True, **histargs_fill)
    ax.set_xlabel(param)
    ax.set_ylabel("Density")
    ax.legend(loc="best", facecolor="none", edgecolor="none", fontsize="small")
    ax.get_yaxis().set_ticks([])
    plt.savefig(os.path.join(outdir, f"posterior_{param}.pdf"), bbox_inches="tight")
    print("Done!", flush=True)

    # Done!
    print(f"\nPlots saved to '{outdir}'!")



if __name__ == "__main__":
    main()