Source code for qopen.site

# Copyright 2015-2016 Tom Eulenfeld, MIT license
"""
Align site responses of different runs of Qopen

Consider two runs of Qopen with stations A, B in run 1 and stations
B, C in run 2. Qopen assumes a mean site amplification for each run of 1
with the default configuration.
Use `align_site_responses` or the command line option ``qopen --align-sites``
to correct source power and site amplification factors afterwards.
In the above case site responses and source powers will be adjusted such that
the site response of station B is the same for both runs.
"""

# The following lines are for Py2/Py3 support with the future module.
from __future__ import (absolute_import, division,
                        print_function, unicode_literals)
from future.builtins import (  # analysis:ignore
    bytes, dict, int, list, object, range, str,
    ascii, chr, hex, input, next, oct, open,
    pow, round, super,
    filter, map, zip)

from collections import defaultdict, OrderedDict
import logging
import warnings

import numpy as np
from obspy.geodetics import gps2dist_azimuth
import scipy

from qopen.source import calculate_source_properties
from qopen.util import gmean


log = logging.getLogger('qopen.site')
log.addHandler(logging.NullHandler())


def _collect_station_coordinates(inventory):
    coords = {}
    for net in inventory.networks:
        for sta in net.stations:
            cha = sta.channels[0]
            lat = cha.latitude or sta.latitude
            lon = cha.longitude or sta.longitude
            key = '%s.%s' % (net.code, sta.code)
            coords[key] = (lat, lon)
    return coords


def _collectR(results, freqi=0, only=None):
    from qopen.core import collect_results
    col = collect_results(results, freqi=freqi, only=['R'])
    R = col['R']
    if only is not None:
        R = {sta: R[sta] for sta in only}
#    for evid, evres in results['events'].items():
#        for sta, Rval in evres['R'].items():
#            if use_only and sta not in use_only:
#                continue
#            R[sta].append(Rval[freqi])
    return R


def _get_number_of_freqs(results):
    Nf = None
    for evid, eres in results['events'].items():
        Nf2 = len(eres['W'])
        if Nf is not None:
            assert Nf == Nf2
        Nf = Nf2
    return Nf


def _rescale_results(results, factors, only=None):
    log.debug('scale events and site responses')
    Nf = _get_number_of_freqs(results)
    for i in range(Nf):
        for k, item in enumerate(results['events'].items()):
            evid, eres = item
            W = eres['W']
            if W[i] is None or np.isnan(W[i]):
                continue
            W[i] /= factors[k, i]
            R = eres['R']
            for sta, Rsta in R.items():
                if Rsta[i] is None or np.isnan(Rsta[i]):
                    continue
                Rsta[i] *= factors[k, i]
                if only and sta not in only:
                    Rsta[i] = None
            if all(Rsta[i] is None or np.isnan(Rsta[i])
                   for Rsta in R.values()):
                W[i] = None


def _Rmean(R):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        Rm = [np.nanmean(np.log(np.array(x, dtype=np.float)))
              for x in R.values()]
    return np.nanmean(Rm)


def _Rstd(R):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        Rstd = [np.nanstd(np.log(np.array(x, dtype=np.float)))
                for x in R.values()]
    return np.nanmean(Rstd)


# http://stackoverflow.com/a/9400562
def _merge_sets(sets):
    newsets, sets = sets, []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets


def _find_unconnected_areas(results, freqi):
    areas = []
    for evid in results['events']:
        R = results['events'][evid]['R']
        area = {sta for sta, Rsta in R.items()
                if Rsta[freqi] is not None and not np.isnan(Rsta[freqi])}
        if len(area) > 0:
            areas.append(area)
    areas = _merge_sets(areas)
    areas = {list(a)[0]: a for a in areas}
    log.info('found %d unconnected areas', len(areas))
    for name in areas:
        stations = areas[name]
        log.debug('area "%s" with %d stations', name, len(stations))
    return areas


def _join_unconnected_areas(areas, max_distance, inventory):
    # At the moment, areas are joined by setting the site response of
    # the station pair with smallest distance to 1
    # Often this works, but sometimes it produces undesired results
    coordinates = _collect_station_coordinates(inventory)
    station_by_coordinate = {c: sta for sta, c in coordinates.items()}
    # reduce number of coordinates in each area
    hulls = {}
    for name in areas:
        points = np.array([coordinates[sta] for sta in areas[name]])
        hull = scipy.spatial.ConvexHull(points)
        hulls[name] = {station_by_coordinate[tuple(p)]
                       for p in points[hull.vertices, :]}
    # calculated distances between unconnected areas
    distance = {}
    for a1 in areas:
        for a2 in areas:
            name = frozenset((a1, a2))
            if name in distance or a1 == a2:
                continue
            dists = {}
            for sta1 in hulls[a1]:
                for sta2 in hulls[a2]:
                    args = coordinates[sta1] + coordinates[sta2]
                    dist = gps2dist_azimuth(*args)[0]
                    dists[(sta1, sta2)] = dist
            mink = min(dists, key=dists.get)
            distance[name] = (dists[mink] / 1e3, mink)
    # join unconnected regions if distance is smaller than max_distance
    near_stations = {}
    while len(distance) > 0:
        nearest_pair = min(distance, key=distance.get)
        dist = distance[nearest_pair][0]
        if dist > max_distance:
            break
        s1, s2 = distance[nearest_pair][1]
        near_stations[s1] = s2
        near_stations[s2] = s1
        a1, a2 = tuple(nearest_pair)
        msg = 'connect areas %s and %s with distance %.1fkm'
        log.debug(msg, a1, a2, dist)
        distance.pop(nearest_pair)
        areas[a1] |= areas.pop(a2)
        hulls[a1] |= hulls.pop(a2)
        for a3 in areas:
            if a3 in (a1, a2):
                continue
            pair1 = frozenset((a1, a3))
            pair2 = frozenset((a2, a3))
            dist1 = distance[pair1]
            dist2 = distance.pop(pair2)
            if dist2[0] < dist1[0]:
                distance[pair1] = dist2
    return areas, near_stations


[docs]def align_site_responses(results, station=None, response=1., use_sparse=True,
                         seismic_moment_method=None,
                         seismic_moment_options=None):
    """
    Align station site responses and correct source parameters (experimental)

    Determine best factor for each event so that site response is the same
    for each station and different events.

    :param results: original result dictionary. For the other options see
        the help for the corresponding command line options or configuration
        parameters.
    :return: corrected result dictionary
    """
    # Ignore not existing event results, sort dict by event id
    results['events'] = OrderedDict(sorted([
        (evid, eres) for (evid, eres) in results['events'].items()
        if eres is not None], key=lambda x: x[0]))
    join_unconnected = None
    if join_unconnected:
        inventory = None
        msg = 'This feature needs more work and tests'
        raise NotImplementedError(msg)
    Ne = len(results['events'])
    if Ne == 1:
        use_sparse = False
    # Determine number of freqs
    Nf = _get_number_of_freqs(results)
    # Determine number of events at stations for each freq band
    Nstations = [defaultdict(int) for i in range(Nf)]
    for evid, eres in results['events'].items():
        for i in range(Nf):
            for sta, Rsta in eres['R'].items():
                Rsta = Rsta[i]
                if Rsta is None or np.isnan(Rsta):
                    continue
                Nstations[i][sta] += 1

    def construct_ols(coldata, b_val):
        # b, row and Arepr are nonlocal lists
        b.append(b_val)
        if use_sparse:
            for col, data in coldata:
                Arepr[0].append(data)
                Arepr[1][0].append(row[0])
                Arepr[1][1].append(col)
        else:
            Arow = np.zeros(Ne)
            for col, data in coldata:
                Arow[col] = data
            Arepr.append(Arow)
        row[0] += 1

    # calculate best factors for each freq band with OLS A*factor=b
    factors = np.empty((Ne, Nf))
    std_before = []
    for i in range(Nf):
        log.debug('align sites for freq no. %d', i)
        # find unconnected areas
        areas = _find_unconnected_areas(results, i)
        if join_unconnected:
            areas, near_stations = _join_unconnected_areas(
                areas, join_unconnected, inventory)
        largest_area = max(areas, key=lambda k: len(areas[k]))
        msg = 'use largest area %s with %d stations'
        log.info(msg, largest_area, len(areas[largest_area]))
        largest_area = areas[largest_area]
        R = _collectR(results, freqi=i, only=largest_area)
        std_before.append(_Rstd(R))
        row = [0]
        b = []
        if use_sparse:
            Arepr = [[], [[], []]]
        else:
            Arepr = []
        norm_row_A = defaultdict(float)
        norm_row_b = np.log(response)
        first = {}
        last = {}
        # add pairs of site responses for one station and different events
        for k, item in enumerate(results['events'].items()):
            evid, eres = item
            for sta, Rsta in eres['R'].items():
                Rsta = Rsta[i]
                if sta not in largest_area:
                    continue
                if Rsta is None or np.isnan(Rsta):
                    continue
                if station is None:
                    # collect information if product of station site responses
                    # is to be normalized
                    fac = 1. / Nstations[i][sta] / len(largest_area)
                    norm_row_A[k] += fac
                    norm_row_b -= np.log(Rsta) * fac
                elif station == sta:
                    # pin site response of specific station
                    fac = 1. / Nstations[i][sta]
                    norm_row_A[k] += fac
                    norm_row_b -= np.log(Rsta) * fac
                if sta in last:
                    # add pairs of site responses for one station
                    # and two different events
                    kl, Rstal = last[sta]
                    b_val = np.log(Rstal) - np.log(Rsta)
                    construct_ols(((k, 1), (kl, -1)), b_val)
                    last[sta] = k, Rsta
                elif (join_unconnected and sta in near_stations.keys() and
                        near_stations[sta] in last):
                    # add pairs of site responses for two nearby stations
                    # (in two previously unconnected areas)
                    # and two different events
                    kl, Rstal = last[near_stations[sta]]
                    b_val = np.log(Rstal) - np.log(Rsta)
                    construct_ols(((k, 1), (kl, -1)), b_val)
                    last[sta] = k, Rsta
                else:
                    last[sta] = first[sta] = (k, Rsta)
        # pin mean site response or site response of specific station
        construct_ols(norm_row_A.items(), norm_row_b)
        msg = 'constructed %scoefficient matrix with shape (%d, %d)'
        log.debug(msg, 'sparse ' * use_sparse, row[0], Ne)
        # solve least squares system
        b = np.array(b)
        if use_sparse:
            A = scipy.sparse.csr_matrix(tuple(Arepr), shape=(row[0], Ne))
            res = scipy.sparse.linalg.lsmr(A, b, atol=1e-8)
        else:
            A = np.array(Arepr)
            res = scipy.linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True)
        factors[:, i] = np.exp(res[0])
    # Scale W and R
    _rescale_results(results, factors, only=largest_area)
    # Calculate omM, M0 and m again
    calculate_source_properties(
        results, seismic_moment_method=seismic_moment_method,
        seismic_moment_options=seismic_moment_options)
    # Calculate mean Rs again, use robust mean here
    results.setdefault('R', {})
    for st, Rst in _collectR(results, freqi=None).items():
        if not np.all(np.isnan(Rst)):
            results['R'][st] = gmean(Rst, axis=0, robust=True).tolist()
    std_after = []
    for i in range(Nf):
        R = _collectR(results, freqi=i)
        std_after.append(_Rstd(R))
    msg = ('aligned sites for all frequencies, reduction of mean stdev: ' +
           ', '.join(Nf * ['%.2g']) + ' -> ' + ', '.join(Nf * ['%.2g']))
    log.info(msg, *(std_before + std_after))
    return results