Source code for qopen.site

# Copyright 2015-2017 Tom Eulenfeld, MIT license
"""
Align site responses of different runs of Qopen

Consider two runs of Qopen with stations A, B in run 1 and stations
B, C in run 2. Qopen assumes a mean site amplification for each run of 1
with the default configuration.
Use `align_site_responses` or the command line option ``qopen --align-sites``
to correct source power and site amplification factors afterwards.
In the above case site responses and source powers will be adjusted such that
the site response of station B is the same for both runs.
"""

from collections import defaultdict, OrderedDict
import logging
import warnings

import numpy as np
from obspy.geodetics import gps2dist_azimuth
import scipy

from qopen.source import calculate_source_properties
from qopen.util import gmeanlist


log = logging.getLogger('qopen.site')
log.addHandler(logging.NullHandler())


def _collect_station_coordinates(inventory):
    coords = {}
    for net in inventory.networks:
        for sta in net.stations:
            cha = sta.channels[0]
            lat = cha.latitude or sta.latitude
            lon = cha.longitude or sta.longitude
            key = '%s.%s' % (net.code, sta.code)
            coords[key] = (lat, lon)
    return coords


def _collectR(results, freqi=0, only=None):
    from qopen.core import collect_results
    col = collect_results(results, freqi=freqi, only=['R'])
    R = col['R']
    if only is not None:
        R = {sta: R[sta] for sta in only}
#    for evid, evres in results['events'].items():
#        for sta, Rval in evres['R'].items():
#            if use_only and sta not in use_only:
#                continue
#            R[sta].append(Rval[freqi])
    return R


def _get_number_of_freqs(results):
    Nf = None
    for evid, eres in results['events'].items():
        Nf2 = len(eres['W'])
        if Nf is not None:
            assert Nf == Nf2
        Nf = Nf2
    return Nf


def _rescale_results(results, factors, only=None):
    log.debug('scale events and site responses')
    Nf = _get_number_of_freqs(results)
    for i in range(Nf):
        for k, item in enumerate(results['events'].items()):
            evid, eres = item
            W = eres['W']
            if W[i] is None or np.isnan(W[i]):
                continue
            W[i] /= factors[k, i]
            R = eres['R']
            for sta, Rsta in R.items():
                if Rsta[i] is None or np.isnan(Rsta[i]):
                    continue
                Rsta[i] *= factors[k, i]
                if only and sta not in only[i]:
                    Rsta[i] = None
            if all(Rsta[i] is None or np.isnan(Rsta[i])
                   for Rsta in R.values()):
                W[i] = None


def _Rmean(R):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        Rm = [np.nanmean(np.log(np.array(x, dtype=float)))
              for x in R.values()]
    return np.nanmean(Rm)


def _Rstd(R):
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        Rstd = [np.nanstd(np.log(np.array(x, dtype=float)))
                for x in R.values()]
    return np.nanmean(Rstd)


# http://stackoverflow.com/a/9400562
def _merge_sets(sets):
    newsets, sets = sets, []
    while len(sets) != len(newsets):
        sets, newsets = newsets, []
        for aset in sets:
            for eachset in newsets:
                if not aset.isdisjoint(eachset):
                    eachset.update(aset)
                    break
            else:
                newsets.append(aset)
    return newsets


def _find_unconnected_areas(results, freqi, ignore_stations=None):
    areas = []
    for evid in results['events']:
        R = results['events'][evid]['R']
        area = {sta for sta, Rsta in R.items()
                if Rsta[freqi] is not None and not np.isnan(Rsta[freqi]) and
                (ignore_stations is None or sta not in ignore_stations)}
        if len(area) > 0:
            areas.append(area)
    areas = _merge_sets(areas)
    areas = {list(a)[0]: a for a in areas}
    log.log(logging.WARNING if len(areas) == 0 else logging.INFO,
            'found %d unconnected areas', len(areas))
    for name in areas:
        stations = areas[name]
        log.debug('area "%s" with %d stations', name, len(stations))
    return areas


def _join_unconnected_areas(areas, max_distance, inventory):
    # At the moment, areas are joined by setting the site response of
    # the station pair with smallest distance to 1
    # Often this works, but sometimes it produces undesired results
    coordinates = _collect_station_coordinates(inventory)
    station_by_coordinate = {c: sta for sta, c in coordinates.items()}
    # reduce number of coordinates in each area
    hulls = {}
    for name in areas:
        points = np.array([coordinates[sta] for sta in areas[name]])
        hull = scipy.spatial.ConvexHull(points)
        hulls[name] = {station_by_coordinate[tuple(p)]
                       for p in points[hull.vertices, :]}
    # calculated distances between unconnected areas
    distance = {}
    for a1 in areas:
        for a2 in areas:
            name = frozenset((a1, a2))
            if name in distance or a1 == a2:
                continue
            dists = {}
            for sta1 in hulls[a1]:
                for sta2 in hulls[a2]:
                    args = coordinates[sta1] + coordinates[sta2]
                    dist = gps2dist_azimuth(*args)[0]
                    dists[(sta1, sta2)] = dist
            mink = min(dists, key=dists.get)
            distance[name] = (dists[mink] / 1e3, mink)
    # join unconnected regions if distance is smaller than max_distance
    near_stations = {}
    while len(distance) > 0:
        nearest_pair = min(distance, key=distance.get)
        dist = distance[nearest_pair][0]
        if dist > max_distance:
            break
        s1, s2 = distance[nearest_pair][1]
        near_stations[s1] = s2
        near_stations[s2] = s1
        a1, a2 = tuple(nearest_pair)
        msg = 'connect areas %s and %s with distance %.1fkm'
        log.debug(msg, a1, a2, dist)
        distance.pop(nearest_pair)
        areas[a1] |= areas.pop(a2)
        hulls[a1] |= hulls.pop(a2)
        for a3 in areas:
            if a3 in (a1, a2):
                continue
            pair1 = frozenset((a1, a3))
            pair2 = frozenset((a2, a3))
            dist1 = distance[pair1]
            dist2 = distance.pop(pair2)
            if dist2[0] < dist1[0]:
                distance[pair1] = dist2
    return areas, near_stations


[docs]def align_site_responses(results, station=None, response=1., use_sparse=True,
                         seismic_moment_method=None,
                         seismic_moment_options=None,
                         ignore_stations=None):
    """
    Align station site responses (amplification) and correct source parameters

    Determine best factor for each event so that site response is the same
    for each station and different events.

    :param results: original result dictionary. For the other options see
        the help for the corresponding command line options or configuration
        parameters.
    :return: corrected result dictionary
    """
    # Ignore not existing event results, sort dict by event id
    results['events'] = OrderedDict(sorted([
        (evid, eres) for (evid, eres) in results['events'].items()
        if eres is not None], key=lambda x: x[0]))
    join_unconnected = None
    if join_unconnected:
        inventory = None
        msg = 'This feature needs more work and tests'
        raise NotImplementedError(msg)
    Ne = len(results['events'])
    if Ne == 1:
        use_sparse = False
    # Determine number of freqs
    Nf = _get_number_of_freqs(results)
    # Determine number of events at stations for each freq band
    Nstations = [defaultdict(int) for i in range(Nf)]
    for evid, eres in results['events'].items():
        for i in range(Nf):
            for sta, Rsta in eres['R'].items():
                Rsta = Rsta[i]
                if Rsta is None or np.isnan(Rsta):
                    continue
                Nstations[i][sta] += 1

    def construct_ols(coldata, b_val):
        # b, row and Arepr are nonlocal lists
        b.append(b_val)
        if use_sparse:
            for col, data in coldata:
                Arepr[0].append(data)
                Arepr[1][0].append(row[0])
                Arepr[1][1].append(col)
        else:
            Arow = np.zeros(Ne)
            for col, data in coldata:
                Arow[col] = data
            Arepr.append(Arow)
        row[0] += 1
    # calculate best factors for each freq band with OLS A*factor=b
    reponse_freqs = [response] * Nf if isinstance(response, (float, int)) else response
    factors = np.ones((Ne, Nf))
    std_before = []
    # one for each freq
    largest_areas = []
    for i in range(Nf):
        log.info('align sites for freq no. %d', i)
        # find unconnected areas
        areas = _find_unconnected_areas(results, i,
                                        ignore_stations=ignore_stations)
        if join_unconnected:
            areas, near_stations = _join_unconnected_areas(
                areas, join_unconnected, inventory)
        if len(areas) == 0:
            largest_areas.append(None)
            std_before.append(np.nan)
            continue
        largest_area = max(areas, key=lambda k: len(areas[k]))
        msg = 'use largest area %s with %d stations'
        log.debug(msg, largest_area, len(areas[largest_area]))
        largest_area = areas[largest_area]
        largest_areas.append(largest_area)
        R = _collectR(results, freqi=i, only=largest_area)
        std_before.append(_Rstd(R))
        row = [0]
        b = []
        if use_sparse:
            Arepr = [[], [[], []]]
        else:
            Arepr = []
        norm_row_A = defaultdict(float)
        norm_row_b = 0
        first = {}
        last = {}
        stations_used_norm = set()
        # add pairs of site responses for one station and different events
        for k, item in enumerate(results['events'].items()):
            evid, eres = item
            for sta, Rsta in eres['R'].items():
                Rsta = Rsta[i]
                if sta not in largest_area:
                    continue
                if Rsta is None or np.isnan(Rsta):
                    continue
                if station is None or sta == station or sta in station:
                    # collect information for normalization
                    stations_used_norm.add(sta)
                    fac = 1. / Nstations[i][sta]
                    norm_row_A[k] += fac
                    norm_row_b -= np.log(Rsta) * fac
                if sta in last:
                    # add pairs of site responses for one station
                    # and two different events
                    kl, Rstal = last[sta]
                    b_val = np.log(Rstal) - np.log(Rsta)
                    construct_ols(((k, 1), (kl, -1)), b_val)
                    last[sta] = k, Rsta
                elif (join_unconnected and sta in near_stations.keys() and
                        near_stations[sta] in last):
                    # add pairs of site responses for two nearby stations
                    # (in two previously unconnected areas)
                    # and two different events
                    kl, Rstal = last[near_stations[sta]]
                    b_val = np.log(Rstal) - np.log(Rsta)
                    construct_ols(((k, 1), (kl, -1)), b_val)
                    last[sta] = k, Rsta
                else:
                    last[sta] = first[sta] = (k, Rsta)
        # pin mean site response or site response of specific station(s)
        msg = 'use geometric mean of %d stations for normalization'
        log.debug(msg, len(stations_used_norm))
        norm_row_b = norm_row_b / len(stations_used_norm) + np.log(reponse_freqs[i])
        for k in norm_row_A:
            norm_row_A[k] /= len(stations_used_norm)
        construct_ols(norm_row_A.items(), norm_row_b)
        msg = 'constructed %scoefficient matrix with shape (%d, %d)'
        log.debug(msg, 'sparse ' * use_sparse, row[0], Ne)
        # solve least squares system
        b = np.array(b)
        if use_sparse:
            A = scipy.sparse.csr_matrix(tuple(Arepr), shape=(row[0], Ne))
            res = scipy.sparse.linalg.lsmr(A, b, atol=1e-9)
        else:
            A = np.array(Arepr)
            res = scipy.linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True)
        factors[:, i] = np.exp(res[0])

    # Scale W and R
    _rescale_results(results, factors, only=largest_areas)
    # Calculate sds, M0 and m again
    calculate_source_properties(
        results, seismic_moment_method=seismic_moment_method,
        seismic_moment_options=seismic_moment_options)
    # Calculate mean Rs again, use robust mean here
    results.setdefault('R', {})
    for st, Rst in _collectR(results, freqi=None).items():
        if not np.all(np.isnan(Rst)):
            results['R'][st] = gmeanlist(Rst, axis=0, robust=True)
    std_after = []
    for i in range(Nf):
        R = _collectR(results, freqi=i)
        std_after.append(_Rstd(R))
    msg = ('aligned sites for all frequencies, reduction of mean stdev: ' +
           ', '.join(Nf * ['%.2g']) + ' -> ' + ', '.join(Nf * ['%.2g']))
    log.info(msg, *(std_before + std_after))
    return results