Source code for sanitize_utilities

#!/usr/bin/env python3

import collections

# Values accepted for the --normalize_method command-line option; 'type' is
# the default.  (None presumably stands for "no normalization" -- confirm
# against the caller.)
VALID_NORMALIZE_METHODS = [None, 'type']
DEFAULT_NORMALIZE_METHOD = VALID_NORMALIZE_METHODS[1]

# Ways of deciding that two edges are duplicates of each other.  Each method
# is named after the columns that must match for edges to count as duplicates:
# 'type': unique based on {node1, node2, type}
# 'node': unique based on {node1, node2}
# 'exact': unique based on {node1, node2, weight, type}
# They are listed in order of preference/likelihood.
VALID_DROP_DUPLICATES_METHOD = [None, 'type', 'node', 'exact']
# The method consulted by drop_duplicates_by_type_or_node ('type').
DROP_DUPLICATES_METHOD = VALID_DROP_DUPLICATES_METHOD[1]


def add_config_args(parser):
    """
    Register this module's command-line options on an existing parser.

    The boolean switches (make_undirected, make_unweighted, normalize,
    ignore_nas, verbose) are all 'store_true' flags; normalize_method takes a
    string restricted to VALID_NORMALIZE_METHODS and defaults to
    DEFAULT_NORMALIZE_METHOD.

    Parameters:
        parser (argparse.ArgumentParser): the parser to add arguments to

    Returns:
        argparse.ArgumentParser: the same parser object, with the arguments
            added
    """
    leading_switches = (
        ('-und', '--make_undirected'),
        ('-unw', '--make_unweighted'),
        ('-norm', '--normalize'),
    )
    trailing_switches = (
        ('-na', '--ignore_nas'),
        ('-v', '--verbose'),
    )

    # Registration order is kept as und, unw, norm, normm, na, v so that the
    # generated usage/help text lists the options in the same order.
    for short_name, long_name in leading_switches:
        parser.add_argument(short_name, long_name, action='store_true')

    parser.add_argument(
        '-normm', '--normalize_method',
        type=str,
        choices=VALID_NORMALIZE_METHODS,
        default=DEFAULT_NORMALIZE_METHOD,
    )

    for short_name, long_name in trailing_switches:
        parser.add_argument(short_name, long_name, action='store_true')

    return parser


def make_network_unweighted(n_df, wgt):
    """
    Make the network unweighted, by setting the weights on all the edges to the
    same value (1).

    Each row is rebuilt rather than modified, so the input rows are left
    untouched.  Rows may be lists or tuples; every output row has the same
    type as the input row it was built from.

    Parameters:
        n_df (iterable): the data, one list or tuple per edge
        wgt (int): the weight column (a negative index counts from the end of
            the row)

    Returns:
        list: the modified data
    """
    unweighted = []
    for row in n_df:
        # Build the replacement cell in the row's own sequence type so the
        # concatenation works for list rows and tuple rows alike.
        unit_weight = type(row)((1,))
        # row[wgt:][1:] is "everything after the weight column".  Unlike
        # row[wgt+1:], it stays correct when wgt is -1 (where wgt+1 == 0 would
        # wrongly select the whole row).
        unweighted.append(row[:wgt] + unit_weight + row[wgt:][1:])
    return unweighted


def make_network_undirected(n_df):
    """
    Make the network undirected; that is, the network should be symmetric, but
    only the edges in one direction are included.  So make the edges in the
    other direction explicit in the network. This assumes that the first two
    columns are the two nodes.

    The result contains every original row followed by one extra row per
    original row with the first two columns swapped.  Rows may be lists or
    tuples; each added row has the same type as the row it mirrors.  The
    input is not modified.

    Parameters:
        n_df (iterable): the data, one list or tuple per edge

    Returns:
        list: the modified data (original rows, then the reversed rows)
    """
    forward_edges = list(n_df)

    reverse_edges = []
    for row in forward_edges:
        # Build the swapped node pair in the row's own sequence type so the
        # concatenation with the remaining columns works for tuples too.
        swapped_nodes = type(row)((row[1], row[0]))
        reverse_edges.append(swapped_nodes + row[2:])

    return forward_edges + reverse_edges


def sort_network(n_df):
    """
    Return the rows of the network in descending order.

    Rows are compared with their natural ordering (element by element for
    list or tuple rows) and the largest row comes first.  The input itself
    is not reordered; a new list is returned.

    Parameters:
        n_df (list): the data

    Returns:
        list: the modified data
    """
    ordered = list(n_df)
    ordered.sort(reverse=True)
    return ordered


def drop_duplicates_by_type_or_node(n_df, n1, n2, typ, *, method=None):
    """
    Drop the duplicates in the network, by type or by node.

    Only adjacent duplicates are detected: each row is compared with the row
    immediately before it, and the first row of every run of duplicates is
    kept.  The input is therefore expected to be sorted so that duplicate
    edges are contiguous and the edge to keep (e.g. the one with the maximum
    weight) comes first in its run.

    By type, the duplicates are where nd1, nd2, and typ are identical; by node,
    the duplicates are where nd1 and nd2 are identical.

    Parameters:
        n_df (list): the data
        n1 (int): the column for the first node
        n2 (int): the column for the second node
        typ (int): the column for the type
        method (str): 'node' to drop duplicates by node; any other value
            drops duplicates by type.  Defaults to the module-level
            DROP_DUPLICATES_METHOD when omitted or None.

    Returns:
        list: the modified data
    """
    if method is None:
        method = DROP_DUPLICATES_METHOD
    by_node_only = method == 'node'

    # A private sentinel (rather than None) marks "no previous row", so a
    # first row whose key columns all hold None is still kept.
    no_previous = object()
    prev_nd1_val = no_previous
    prev_nd2_val = no_previous
    prev_type_val = no_previous

    new_n_df = []

    for row in n_df:
        nd1_val = row[n1]
        nd2_val = row[n2]
        type_val = row[typ]
        nodes_differ = nd1_val != prev_nd1_val or nd2_val != prev_nd2_val
        type_differs = type_val != prev_type_val
        # A change of nodes always starts a new group; a change of type only
        # starts one when duplicates are being judged by type.
        if nodes_differ or (type_differs and not by_node_only):
            new_n_df.append(row)
        prev_nd1_val = nd1_val
        prev_nd2_val = nd2_val
        prev_type_val = type_val

    return new_n_df


def normalize_network_by_type(n_df, typ, wgt):
    """
    Normalize the network.

    Currently the only normalization method implemented is by type: each
    edge's weight is divided by the sum of the weights of all edges that
    share its type, and the quotient is stored as a string formatted with
    "{:.6g}".

    Rows may be lists or tuples; every output row has the same type as the
    input row it was built from, and the input rows are not modified.

    Parameters:
        n_df (iterable): the data, one list or tuple per edge
        typ (int): the type column
        wgt (int): the weight column (a negative index counts from the end of
            the row); the weights must be numeric

    Returns:
        list: the modified data

    Raises:
        ZeroDivisionError: if the weights of some type sum to zero
    """
    # Two passes are needed (totals first, then division), so materialise the
    # input once in case it is a one-shot iterable.
    rows = list(n_df)

    sums = collections.Counter()
    for row in rows:
        sums[row[typ]] += row[wgt]

    normalized = []
    for row in rows:
        share = "{:.6g}".format(row[wgt] / sums[row[typ]])
        # Wrap the new cell in the row's own sequence type so concatenation
        # works for both list and tuple rows.  row[wgt:][1:] is "everything
        # after the weight column" and, unlike row[wgt+1:], stays correct
        # when wgt is -1.
        normalized.append(row[:wgt] + type(row)((share,)) + row[wgt:][1:])
    return normalized


def upper_triangle(n_df, n1, n2):
    """
    Make a (sparse) matrix upper triangular.

    Keeps only the edges whose value in column n1 is strictly less than the
    value in column n2; edges where the two are equal are dropped as well.

    Parameters:
        n_df (list): the data
        n1 (int): the column for the first node
        n2 (int): the column for the second node

    Returns:
        list: the rows that satisfy row[n1] < row[n2], in their original order
    """
    kept = []
    for row in n_df:
        if row[n1] < row[n2]:
            kept.append(row)
    return kept