Source code for check_utilities

"""module for checking if source needs to be updated in Knowledge Network (KN).

Contains the class SrcClass which serves as the base class for each supported
source in the KN.

Contains module functions::

    get_SrcClass(args)
    compare_versions(SrcClass)
    check(module, args=None)
    main_parse_args()

Examples:
    To run check on a single source (e.g. dip)::

        $ python3 code/check_utilities.py dip

    To view all optional arguments that can be specified::

        $ python3 code/check_utilities.py -h

"""

import urllib.request
import urllib.error
import os
import time
import json
import csv
import sys
from argparse import ArgumentParser
import config_utilities as cf
import table_utilities as tu
import mysql_utilities as mu
import import_utilities as iu

[docs]class SrcClass(object): """Base class to be extended by each supported source in KnowEnG. This SrcClass provides default functions that should be extended or overridden by any source which is added to the Knowledge Network (KN). Attributes: name (str): The name of the remote source to be included in the KN. url_base (str): The base url of the remote source, which may need additional processing to provide an actual download link (see get_remote_url). aliases (dict): A dictionary with subsets of the source which will be included in the KN as the keys (e.g. different species, data types, or interaction types), and a short string with information about the alias as the value. remote_file (str): The name of the file to extract if the remote source is a directory version (dict): The release version of each alias in the source. source_url (str): The website for the source. reference (str): The citation for the source. pmid (str): The pubmed ID for the source. license (str): The license for the source. """ def __init__(self, src_name, base_url, aliases, args=None): """Init a SrcClass object with the provided parameters. Constructs a SrcClass object with the provided parameters, which should be provided by any class extending SrcClass. Args: src_name (str): The name of the remote source to be included in the KN. Must be provided by the extending class. url_base (str): The base url of the remote source, which may need additional processing to provide an actual download link (see get_remote_url). Must be provided by the extending class. aliases (dict): A dictionary with subsets of the source which will be included in the KN as the keys (e.g. different species, data types, or interaction types), and a short string with information about the alias as the value. args (Namespace): args as populated namespace or 'None' for defaults """ if args is None: args = cf.config_args() self.name = src_name self.url_base = base_url self.aliases = aliases self.remote_file = '' self.version = dict() self.args = args self.chunk_size = 500000
[docs] def get_aliases(self, args=cf.config_args()): """Helper function for producing the alias dictionary. This returns a dictionary where alias names are keys and alias info are the values. This helper function uses the species specific information for the build of the Knowledge Network, which is produced by ensembl.py during setup utilities and is located at cf.DEFAULT_MAP_PATH/species/species.json, in order to fetch all matching species specific aliases from the source. Args: args (Namespace): args as populated namespace or 'None' for defaults Returns: dict: A dictionary of species:(taxid, division) values """ return dict()
[docs] def get_source_version(self, alias): """Return the release version of the remote source:alias. This returns the release version of the remote source for a specific alias. This value will be the same for every alias unless the the alias can have a different release version than the source (this will be source dependent). This value is stored in the self.version dictionary object. If the value does not already exist, all aliases versions are initialized to 'unknown'. Args: alias (str): An alias defined in self.aliases. Returns: str: The remote version of the source. """ if alias not in self.version: for alias_name in self.aliases: self.version[alias_name] = 'unknown' return self.version[alias]
[docs] def get_local_file_info(self, alias): """Return a dictionary with the local file information for the alias. This returns the local file information for a given source alias, which will always contain the following keys:: 'local_file_name' (str): name of the file locally 'local_file_exists' (bool): boolean if file exists at path indicated by 'local_file_name' and will also contain the following if 'local_file_exists' is True:: 'local_size' (int): size of local file in bytes 'local_date' (float): time of last modification time of local file in seconds since the epoch Args: alias (str): An alias defined in self.aliases. Returns: dict: The local file information for a given source alias. """ f_dir = os.path.join(self.args.working_dir, self.args.data_path, self.name) f_dir = os.path.join(f_dir, alias) url = self.get_remote_url(alias) filename = os.path.basename(url).replace('%20', '_') file = os.path.join(f_dir, filename) local_dict = dict() local_dict['local_file_name'] = filename local_dict['local_file_exists'] = os.path.isfile(file) if not local_dict['local_file_exists']: return local_dict local_dict['local_size'] = os.path.getsize(file) local_dict['local_date'] = os.path.getmtime(file) return local_dict
[docs] def get_local_version_info(self, alias, args): """Return a dictionary with the local information for the alias. This returns the local information for a given source alias, as retrieved from the msyql database and formated as a dicitonary object. (see mysql_utilities.get_file_meta). It adds the local_file_name and local_file_exists to the fields retrieved from the database, which are the name of the file locally and a boolean indicating if it already exists on disk, respectively. Args: alias (str): An alias defined in self.aliases. Returns: dict: The local file information for a given source alias. """ file_id = '.'.join([self.name, alias]) file_meta = mu.get_file_meta(file_id, args) f_dir = os.path.join(self.args.working_dir, self.args.data_path, self.name) f_dir = os.path.join(f_dir, alias) url = self.get_remote_url(alias) filename = os.path.basename(url).replace('%20', '_') file = os.path.join(f_dir, filename) file_meta['local_file_name'] = filename file_meta['local_file_exists'] = os.path.isfile(file) return file_meta
[docs] def get_remote_file_size(self, alias): """Return the remote file size. This returns the remote file size as specificied by the 'content-length' page header. If the remote file size is unknown, this value should be -1. Args: remote_url (str): The url of the remote file to get the size of. Returns: int: The remote file size in bytes. """ remote_url = self.get_remote_url(alias) try: response = urllib.request.urlopen(remote_url) print(response.headers) return int(response.headers['content-length']) except (TypeError, ValueError, urllib.error.URLError): return -1
[docs] def get_remote_file_modified(self, alias): """Return the remote file date modified. This returns the remote file date modifed as specificied by the 'last-modified' page header. Args: remote_url (str): The url of the remote file to get the date modified of. Returns: float: time of last modification time of remote file in seconds since the epoch """ remote_url = self.get_remote_url(alias) try: response = urllib.request.urlopen(remote_url) time_str = response.headers['last-modified'] time_format = "%a, %d %b %Y %H:%M:%S %Z" return time.mktime(time.strptime(time_str, time_format)) except (urllib.error.URLError, ValueError, TypeError, ConnectionResetError): return float(0)
[docs] def get_remote_url(self, alias): """Return the remote url needed to fetch the file corresponding to the alias. This returns the url needed to fetch the file corresponding to the alias. By default this returns self.base_url. Args: alias (str): An alias defined in self.aliases. Returns: str: The url needed to fetch the file corresponding to the alias. """ return self.url_base
[docs] def is_map(self, alias): """Return a boolean representing if the provided alias is used for source specific mapping of nodes or edges. This returns a boolean representing if the alias corresponds to a file used for mapping. By default this returns True if the alias ends in '_map' and False otherwise. Args: alias (str): An alias defined in self.aliases. Returns: bool: Whether or not the alias is used for mapping. """ return alias[-4:] == '_map'
[docs] def get_dependencies(self, alias): """Return a list of other aliases that the provided alias depends on. This returns a list of other aliases that must be processed before full processing of the provided alias can be completed. By default, returns a list of all aliases which are considered mapping files (see is_map) Args: alias(str): An alias defined in self.aliases. Returns: list: The other aliases defined in self.aliases that the provided alias depends on. """ depends = list() if self.is_map(alias): return depends for alias_name in self.aliases: if alias_name == alias: continue elif self.is_map(alias_name): depends.append(alias_name) return depends
[docs] def create_mapping_dict(self, filename, key_col=3, value_col=4): """Return a mapping dictionary for the provided file. This returns a dictionary for use in mapping nodes or edge types from the file specified by filetype. By default it opens the file specified by filename creates a dictionary using the key_col column as the key and the value_col column as the value. Args: filename (str): The name of the file containing the information needed to produce the maping dictionary. key_col (int): The column containing the key for creating the dictionary. By default this is column 3. value_col (int): The column containing the value for creating the dictionary. By default this is column 4. Returns: dict: A dictionary for use in mapping nodes or edge types. """ src = filename.split('.')[0] alias = filename.split('.')[1] map_dict = dict() n_meta_file = filename.replace('raw_line', 'node_meta') node_file = filename.replace('raw_line', 'node') if not self.is_map(alias): return map_dict with open(filename, 'rb') as map_file, \ open(n_meta_file, 'w') as n_meta, \ open(node_file, 'w') as nfile: reader = csv.reader((line.decode('utf-8') for line in map_file), delimiter='\t') n_meta_writer = csv.writer(n_meta, delimiter='\t', lineterminator='\n') n_writer = csv.writer(nfile, delimiter='\t', lineterminator='\n') for line in reader: chksm = line[2] orig_id = line[key_col].strip() orig_name = line[value_col].strip() kn_id = cf.pretty_name(orig_id) kn_name = cf.pretty_name(src + '_' + orig_name) map_dict[orig_id] = kn_id + '::' + kn_name n_writer.writerow([kn_id, kn_name]) n_meta_writer.writerow([kn_id, 'orig_desc', orig_name]) n_meta_writer.writerow([kn_id, 'orig_id', orig_id]) outfile = node_file.replace('node', 'unique.node') tu.csu(node_file, outfile) outfile = n_meta_file.replace('node_meta', 'unique.node_meta') tu.csu(n_meta_file, outfile) return map_dict
[docs] def table(self, raw_line, version_dict): """Uses the provided :ref:`raw_lines file <rawline-label>` to produce :ref:`a table file <table-label>`, :ref:`an edge_meta file <edge_meta-label>`, and :ref:`a node_meta file <node_meta-label>` (only for property nodes). This returns nothing but produces the table formatted files from the provided raw_lines file:: raw_lines (file, line num, line_chksum, raw_line) table table (line_cksum, n1name, n1hint, n1type, n1spec, n2name, n2hint, n2type, n2spec, et_hint, score) edge_meta (line_cksum, info_type, info_desc) node_meta (node_id, info_type (alt_alias, relationship, experiment, or link), info_desc (text)) By default this function does nothing (must be overridden) Args: raw_line (str): The path to the raw_lines file version_dict (dict): A dictionary describing the attributes of the alias for a source. """ return
[docs]def get_SrcClass(args, *posargs, **kwargs): """Returns an object of the source class. This returns an object of the source class to allow access to its functions if the module is imported. Args: args (Namespace): args as populated namespace or 'None' for defaults Returns: SrcClass: a source class object """ return SrcClass(args, *posargs, **kwargs)
[docs]def compare_versions(src_obj, args=None): """Return a dictionary with the version information for each alias in the source and write a dictionary for each alias to file. This returns a nested dictionary describing the version information of each alias in the source. The version information is also printed. Args: src_obj (SrcClass): A SrcClass object for which the comparison should be performed. args (Namespace): args as populated namespace or 'None' for defaults Returns: dict: A nested dictionary describing the version information for each alias described in src_obj. For each alias the following keys are defined:: 'source' (str): The source name, 'alias' (str): The alias name, 'alias_info' (str): A short string with information about the alias, 'is_map' (bool): See is_map, 'dependencies' (lists): See get_dependencies, 'remote_url' (str): See get_remote_url, 'remote_date' (float): See get_remote_file_modified, 'remote_version' (str): See get_source_version, 'remote_file' (str): File to extract if remote file location is a directory, 'remote_size' (int): See get_remote_file_size, 'local_file_name' (str): See get_local_version_info, 'file_exists' (bool): See get_local_version_info, 'fetch_needed' (bool): True if file needs to be downloaded from remote source. A fetch will be needed if the local file does not exist, or if the local and remote files have different date modified or file sizes. """ version_dict = dict() file_meta = dict() for alias in src_obj.aliases: print('Comparing versions for {0}'.format(alias)) file_meta[alias] = src_obj.get_local_version_info(alias, args) version_dict[alias] = dict() version_dict[alias]['source'] = src_obj.name version_dict[alias]['alias'] = alias version_dict[alias]['alias_info'] = src_obj.aliases[alias] version_dict[alias]['is_map'] = src_obj.is_map(alias) version_dict[alias]['dependencies'] = src_obj.get_dependencies(alias) remote_url = src_obj.get_remote_url(alias) version_dict[alias]['remote_url'] = remote_url version_dict[alias]['remote_file'] = src_obj.remote_file version_dict[alias]['remote_date'] = \ src_obj.get_remote_file_modified(alias) version_dict[alias]['remote_version'] = \ src_obj.get_source_version(alias) version_dict[alias]['remote_size'] = src_obj.get_remote_file_size(alias) version_dict[alias]['local_file_name'] = \ file_meta[alias]['local_file_name'] version_dict[alias]['file_exists'] = \ file_meta[alias]['file_exists'] version_dict[alias]['source_url'] = src_obj.source_url version_dict[alias]['image'] = src_obj.image version_dict[alias]['reference'] = src_obj.reference version_dict[alias]['pmid'] = src_obj.pmid version_dict[alias]['license'] = src_obj.license if not file_meta[alias]['file_exists']: version_dict[alias]['fetch_needed'] = True continue l_size = file_meta[alias]['size'] r_size = version_dict[alias]['remote_size'] l_date = file_meta[alias]['date'] r_date = version_dict[alias]['remote_date'] l_version = file_meta[alias]['version'] r_version = version_dict[alias]['remote_version'] if r_size == -1 and r_date == 0 and r_version == 'unknown': version_dict[alias]['fetch_needed'] = True elif l_size == r_size and l_date == r_date and l_version == r_version: version_dict[alias]['fetch_needed'] = False else: version_dict[alias]['fetch_needed'] = True f_dir = os.path.join(src_obj.args.working_dir, src_obj.args.data_path, src_obj.name) os.makedirs(f_dir, exist_ok=True) for alias in src_obj.aliases: a_dir = os.path.join(f_dir, alias) os.makedirs(a_dir, exist_ok=True) f_name = os.path.join(a_dir, 'file_metadata.json') with open(f_name, 'w') as outfile: json.dump(version_dict[alias], outfile, indent=4, sort_keys=True) #print(json.dumps(version_dict, indent=4, sort_keys=True)) print("printing file_metadata.json") return version_dict
[docs]def check(module, args=None): """Runs compare_versions(SrcClass) on a 'module' object This runs the compare_versions function on a 'module' object to find the version information of the source and determine if a fetch is needed. The version information is also printed. Args: module (str): string name of module defining source specific class args (Namespace): args as populated namespace or 'None' for defaults Returns: dict: A nested dictionary describing the version information for each alias described in source. """ if args is None: args = cf.config_args() src_code_dir = os.path.join(args.code_path, args.src_path) sys.path.append(src_code_dir) src_module = __import__(module) SrcClass = src_module.get_SrcClass(args) version_dict = compare_versions(SrcClass, args) for alias in version_dict: iu.import_filemeta(version_dict[alias], args) return version_dict
[docs]def main_parse_args(): """Processes command line arguments. Expects three positional arguments(start_step, deploy_loc, run_mode) and a number of optional arguments. If arguments are missing, supplies default values. Returns: Namespace: args as populated namespace """ parser = ArgumentParser() parser.add_argument('module', help='select SrcClass to check, e.g. dip') parser = cf.add_config_args(parser) args = parser.parse_args() return args
if __name__ == "__main__": args = main_parse_args() check(args.module, args)