diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f9fff64a --- /dev/null +++ b/.gitignore @@ -0,0 +1,16 @@ +__pycache__/ +*.py[cod] +*$py.class +.python-version +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +.pyc +.profraw +.dat +.DS_Store +._* diff --git a/WORKSPACE b/WORKSPACE index de5933f4..e75554fb 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,4 +1,5 @@ workspace(name="datacommons") +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") # The following rules are needed to perform pip-install of dependencies. # Reference: https://github.com/bazelbuild/rules_python diff --git a/datacommons/__init__.py b/datacommons/__init__.py index 2b130cf5..effee554 100644 --- a/datacommons/__init__.py +++ b/datacommons/__init__.py @@ -13,4 +13,7 @@ # limitations under the License. """Data Commons module.""" -from .datacommons import Client +from .bio import BioExtension, DEFAULT_BEDLINE_PROPS +from .datacommons import Client, DCNode, DCFrame +from .places import PlacesExtension +from .utils import DatalogQuery, MeasuredValue diff --git a/datacommons/bio.py b/datacommons/bio.py new file mode 100644 index 00000000..9b6c821f --- /dev/null +++ b/datacommons/bio.py @@ -0,0 +1,368 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data Commons Bio Data API Extension. 
def BioExtension(frame):
  """ Extends a frame with the DataCommons bio query methods.

  Each bio query function defined in this module is bound to the given frame
  under its own name, so it can be called as frame.get_experiments(...),
  frame.get_bed_files(...) and frame.get_bed_lines(...).

  Args:
    frame: The frame object to extend. It is modified in place.

  Returns:
    The same frame object that was passed in.
  """
  for bio_func in (get_experiments, get_bed_files, get_bed_lines):
    setattr(frame, bio_func.__name__, MethodType(bio_func, frame))
  return frame
def get_experiments(self, new_col_name, **kwargs):
  """ Creates a column of experiments in the dataframe.

  Builds a datalog query for nodes of type EncodeExperiment, narrows it with
  whichever keyword filters are supplied, wraps the result in a DCFrame and
  merges that frame into this one via self.merge.

  Args:
    new_col_name: The name of the new column created. Must not already exist.
    assay_category: A list of assay categories investigated by the experiment.
    assay_target: A list of assay targets investigated by the experiment. Each
      target is expanded to its ENCODE target URL before being queried.
    assembly: A list of assemblies to filter by.
    bio_class: A list of biosample classes to filter by. This argument must be
      provided with bio_term. The i-th biosample class with the i-th biosample
      type describes one biosample to filter for.
    bio_term: A list of biosample types. This argument must be provided with
      bio_class. The i-th biosample class with the i-th biosample type
      describes one biosample to filter for.
    lab_name: A list of lab names that published the experiment.
    rows: The maximum number of rows to query for. Defaults to _MAX_ROWS.

  Raises:
    ValueError: If new_col_name is already a column, if only one of bio_class
      and bio_term is given, or if their lengths differ.
  """
  if new_col_name in self._dataframe:
    raise ValueError('{} is already a column.'.format(new_col_name))
  if ('bio_class' in kwargs) != ('bio_term' in kwargs):
    raise ValueError('Only one of bio_class or bio_term is specified.')
  if 'bio_class' in kwargs and len(kwargs['bio_class']) != len(kwargs['bio_term']):
    raise ValueError('Number of bio_class and bio_term parameters must match.')

  # Get the query variable and type hint
  new_col_var = '?' + new_col_name.replace(' ', '_')
  labels = {new_col_var: new_col_name}
  type_hint = {new_col_var: 'EncodeExperiment'}

  # Get the row limit
  rows = kwargs.get('rows', _MAX_ROWS)

  # Row selection and table post processing needed if bio summary provided.
  select = None
  process = None

  # Construct the query
  query = utils.DatalogQuery()
  query.add_variable(new_col_var)
  query.add_constraint(new_col_var, 'typeOf', 'EncodeExperiment')
  if 'assay_category' in kwargs:
    categories = ['"{}"'.format(val) for val in kwargs['assay_category']]
    query.add_constraint(new_col_var, 'assaySlims', categories)
  if 'assay_target' in kwargs:
    # Targets are stored as ENCODE URLs, so expand every given target name.
    target_urls = [_ENCODE_TARGET_URL.format(target)
                   for target in kwargs['assay_target']]
    targets = ['"{}"'.format(url) for url in target_urls]
    query.add_constraint(new_col_var, 'target', targets)
  if 'assembly' in kwargs:
    assemblies = ['"{}"'.format(val) for val in kwargs['assembly']]
    query.add_constraint(new_col_var, 'assembly', assemblies)
  if 'bio_class' in kwargs and 'bio_term' in kwargs:
    classes = ['"{}"'.format(val) for val in kwargs['bio_class']]
    terms = ['"{}"'.format(val) for val in kwargs['bio_term']]
    query.add_variable('?bioClass', '?bioTerm')
    query.add_constraint(new_col_var, 'biosampleOntology', '?bioNode')
    query.add_constraint('?bioNode', 'classification', classes)
    query.add_constraint('?bioNode', 'termName', terms)
    query.add_constraint('?bioNode', 'classification', '?bioClass')
    query.add_constraint('?bioNode', 'termName', '?bioTerm')

    # The query constraints alone accept any class with any term. The select
    # function enforces the paired-ness of the i-th class and i-th term, and
    # the process function drops the two helper columns afterwards.
    select = select_biosample_summary('?bioClass', '?bioTerm', classes, terms)
    process = utils.delete_column('?bioClass', '?bioTerm')
  if 'lab_name' in kwargs:
    lab_names = ['"{}"'.format(name) for name in kwargs['lab_name']]
    query.add_constraint(new_col_var, 'lab', '?labNode')
    query.add_constraint('?labNode', 'name', lab_names)

  # Perform the query and merge
  new_frame = DCFrame(datalog_query=query,
                      labels=labels,
                      type_hint=type_hint,
                      select=select,
                      process=process,
                      rows=rows)
  self.merge(new_frame)
def get_bed_files(self, seed_col_name, new_col_name, **kwargs):
  """ Adds a column of bed file DCIDs derived from a seed column.

  The seed column may hold experiments (type EncodeExperiment), in which case
  bed files are matched by the experiment they come from, or plain text (type
  Text), in which case the values are taken to be lab names.

  Args:
    seed_col_name: The name of the existing experiment or lab name column.
    new_col_name: The name of the new column created.
    rows: The maximum number of rows to query for. Defaults to _MAX_ROWS.

  Raises:
    ValueError: If the seed column is missing or empty, if the new column
      already exists, or if the seed column type is not supported.
  """
  frame = self._dataframe
  if seed_col_name not in frame:
    raise ValueError('{} is not a column in the frame.'.format(seed_col_name))
  if new_col_name in frame:
    raise ValueError('{} is already a column.'.format(new_col_name))

  row_limit = kwargs.get('rows', _MAX_ROWS)

  # Query variables are the column names with spaces stripped.
  seed_var = '?' + seed_col_name.replace(' ', '')
  new_var = '?' + new_col_name.replace(' ', '')
  seed_type = self._col_types[seed_col_name]

  bed_query = utils.DatalogQuery()
  bed_query.add_variable(seed_var, new_var)
  bed_query.add_constraint(new_var, 'fromExperiment', '?experimentNode')

  # How the seed values constrain the query depends on the seed column type.
  seed_values = list(frame[seed_col_name])
  if not seed_values:
    raise ValueError('Seed column {} is empty.'.format(seed_col_name))
  if seed_type == 'EncodeExperiment':
    bed_query.add_constraint('?experimentNode', 'dcid', seed_var)
    bed_query.add_constraint('?experimentNode', 'dcid', seed_values)
  elif seed_type == 'Text':
    quoted_labs = ['"{}"'.format(lab) for lab in seed_values]
    bed_query.add_constraint(new_var, 'lab', '?labNode')
    bed_query.add_constraint('?labNode', 'name', seed_var)
    bed_query.add_constraint('?labNode', 'name', quoted_labs)
  else:
    raise ValueError('Invalid seed column type {} for column {}.'
                     .format(seed_type, seed_col_name))

  # Run the query and fold the result into this frame.
  self.merge(DCFrame(datalog_query=bed_query,
                     labels={seed_var: seed_col_name, new_var: new_col_name},
                     type_hint={seed_var: seed_type, new_var: 'EncodeBedFile'},
                     rows=row_limit))
def get_bed_lines(self, seed_col_name, prop_info=DEFAULT_BEDLINE_PROPS, **kwargs):
  """ Adds columns of bed line properties for a column of bed files.

  For every bed file DCID in the seed column, queries the BedLine nodes that
  come from that file and adds one column per entry in prop_info. Optional
  chromosome and position filters are applied to the query result before it
  is merged into this frame via self.merge.

  Args:
    seed_col_name: The name of the bed file column.
    prop_info: A list of tuples specifying which properties in a BedLine to
      query for. Each tuple contains:
        (1) The name of the new column.
        (2) The property to query for.
        (3) The type mapped to by the property.
      See DEFAULT_BEDLINE_PROPS for more details. The list is only read.
    chromosome: A list of chromosomes to filter for.
    start_pos: A list of start positions to filter measurements for. This must
      be provided with end_pos with a list of the same length. The i-th start
      position with the i-th end position defines one interval to filter for.
    end_pos: A list of end positions to filter measurements for. This must be
      provided with start_pos with a list of the same length. The i-th start
      position with the i-th end position defines one interval to filter for.
    rows: The maximum number of rows to query for. Defaults to _MAX_ROWS.

  Raises:
    ValueError: If the seed column is missing or empty, if only one of
      start_pos and end_pos is given, or if their lengths differ.
  """
  if seed_col_name not in self._dataframe:
    raise ValueError('{} is not a column in the frame.'.format(seed_col_name))
  if ('start_pos' in kwargs) != ('end_pos' in kwargs):
    raise ValueError('Must provide both start_pos and end_pos.')
  if 'start_pos' in kwargs and len(kwargs['start_pos']) != len(kwargs['end_pos']):
    raise ValueError('Length of start_pos must equal length of end_pos.')

  # Get the row limit
  rows = kwargs.get('rows', _MAX_ROWS)

  # Get seed column information
  seed_col = list(self._dataframe[seed_col_name])
  seed_col_var = '?' + seed_col_name.replace(' ', '')
  if not seed_col:
    raise ValueError('Seed column {} is empty.'.format(seed_col_name))

  # Create the query
  query = utils.DatalogQuery()
  query.add_variable(seed_col_var)
  query.add_constraint('?bedFileNode', 'dcid', seed_col)
  query.add_constraint('?bedFileNode', 'dcid', seed_col_var)
  query.add_constraint('?bedLineNode', 'typeOf', 'BedLine')
  query.add_constraint('?bedLineNode', 'fromBedFile', '?bedFileNode')

  # If filtering by a specific parameter, need to store which variable is
  # querying for the parameter being filtered by.
  chromosome_var = None
  start_pos_var = None
  end_pos_var = None

  # Add specific properties to query from the bed line
  labels, type_hint = {}, {}
  for col_name, prop_name, prop_type in prop_info:
    query_var = '?' + col_name.replace(' ', '')
    labels[query_var] = col_name
    type_hint[query_var] = prop_type

    # Add variable and constraint
    query.add_variable(query_var)
    query.add_constraint('?bedLineNode', prop_name, query_var)

    # Store variable if filtering by appropriate parameter
    if 'chromosome' in kwargs and prop_name == 'chromosome':
      chromosome_var = query_var
    if 'start_pos' in kwargs and prop_name == 'chromosomeStart':
      start_pos_var = query_var
    if 'end_pos' in kwargs and prop_name == 'chromosomeEnd':
      end_pos_var = query_var

  # Create filters based on the parameters. A filtered property that is not
  # already requested through prop_info is queried under a helper variable
  # and dropped again after filtering.
  select, process, select_funcs, drop_cols = None, None, [], []
  if 'chromosome' in kwargs:
    if not chromosome_var:
      chromosome_var = '?chromosome'
      query.add_variable(chromosome_var)
      query.add_constraint('?bedLineNode', 'chromosome', chromosome_var)
      drop_cols.append(chromosome_var)
    select_funcs.append(select_chromosome(chromosome_var, kwargs['chromosome']))
  if 'start_pos' in kwargs:
    if not start_pos_var:
      start_pos_var = '?chromStart'
      query.add_variable(start_pos_var)
      query.add_constraint('?bedLineNode', 'chromosomeStart', start_pos_var)
      drop_cols.append(start_pos_var)
    if not end_pos_var:
      end_pos_var = '?chromEnd'
      query.add_variable(end_pos_var)
      query.add_constraint('?bedLineNode', 'chromosomeEnd', end_pos_var)
      drop_cols.append(end_pos_var)
    select_funcs.append(
        select_chrom_pos(start_pos_var,
                         end_pos_var,
                         kwargs['start_pos'],
                         kwargs['end_pos']))

  # If filters were specified, compose the filters and add a post processor if
  # necessary.
  if select_funcs:
    select = utils.compose_select(*select_funcs)
  if drop_cols:
    process = utils.delete_column(*drop_cols)

  # Perform the query and merge
  new_frame = DCFrame(datalog_query=query,
                      labels=labels,
                      type_hint=type_hint,
                      select=select,
                      process=process,
                      rows=rows)
  self.merge(new_frame)
def select_biosample_summary(bio_class_col, bio_term_col, bio_classes, bio_terms):
  """ Returns a filter function which filters for appropriate summaries.

  When bio_class and bio_term are specified for querying experiments, the
  returned experiments must match the i-th bio_class and i-th bio term for some
  given i. It cannot match a bio_term and bio_class at different indices in the
  given parameters.

  Args:
    bio_class_col: The name of the bio_class column
    bio_term_col: The name of the bio_term column
    bio_classes: Allowed biosample classes. Double quotes are stripped.
    bio_terms: Allowed biosample types. Double quotes are stripped.

  Returns:
    A function taking a row and returning True if the row's class and term
    match one of the allowed (class, term) pairs.
  """
  # Strip the quotes once here rather than for every row. Materializing the
  # pairs also keeps the filter reusable when iterators are passed in.
  allowed_pairs = tuple(
      (bio_class.replace('"', ''), bio_term.replace('"', ''))
      for bio_class, bio_term in zip(bio_classes, bio_terms))

  def select(row):
    for bio_class, bio_term in allowed_pairs:
      if row[bio_class_col] == bio_class and row[bio_term_col] == bio_term:
        return True
    return False
  return select

def select_chromosome(chrom_col, chromosomes):
  """ Returns a filter function which filters for given chromosomes.

  Args:
    chrom_col: Name of the column containing chromosome information.
    chromosomes: Allowed chromosomes to filter for.

  Returns:
    A function taking a row and returning True if the row's chromosome is one
    of the given chromosomes.
  """
  def select(row):
    return row[chrom_col] in chromosomes
  return select
def select_chrom_pos(start_pos_col, end_pos_col, start_pos, end_pos):
  """ Returns a function which filters for chromosomes in the given intervals.

  A row passes if its start and end positions both lie inside at least one
  interval [start_pos[i], end_pos[i]]. Rows whose start or end position is
  missing (None or the empty string) never pass. A position of 0 is a valid
  value and is not treated as missing.

  Args:
    start_pos_col: Name of column containing the start position
    end_pos_col: Name of column containing the end position
    start_pos: Allowed start positions
    end_pos: Allowed end positions

  Returns:
    A function filtering for measurements in the given intervals.
  """
  # Pair the bounds once so the filter can be applied to any number of rows.
  intervals = list(zip(start_pos, end_pos))

  def is_missing(value):
    # Explicit check instead of truthiness so a numeric 0 is kept.
    return value is None or value == ''

  def select(row):
    for lower, upper in intervals:
      row_start, row_end = row[start_pos_col], row[end_pos_col]
      if is_missing(row_start) or is_missing(row_end):
        continue
      if int(row_start) >= lower and int(row_end) <= upper:
        return True
    return False
  return select
def property_type(self, ent_type, property, outgoing=True):
  """Returns the type pointed to by the given property and entity type.

  Both lookup tables are keyed first by entity type and then by property
  name, so the entity type is resolved before the property. Lookups never
  insert new keys into the tables.

  Args:
    ent_type: The entity type
    property: The property relating the given entity type to another.
    outgoing: Whether or not the property points away or towards the given
      entity. By default this is set to true.

  Returns:
    The type of the second entity contained in a triple formed from the given
    entity type and property. Returns None if no such property, entity type
    combination exists.
  """
  if outgoing:
    return self._prop_type.get(ent_type, {}).get(property)

  incoming_type = self._inv_prop_type.get(ent_type, {}).get(property)
  if incoming_type is not None:
    return incoming_type
  # Fall back to the hard-coded parent types for incoming properties that the
  # server did not report for this entity type.
  return _PARENT_TYPES.get(property)
def query(self, datalog_query, rows=100):
  """ Runs a datalog query and returns the results as a Pandas DataFrame.

  Args:
    datalog_query: String representing datalog query in [TODO(shanth): link]
    rows: Max number of returned rows. Set rows to -1 to return all results.

  Returns:
    A Pandas.DataFrame whose columns are the header entries of the response,
    in response order, with duplicate rows removed.

  Raises:
    RuntimeError: some problem with executing query (hint in the string)
  """
  assert self._inited, 'Initialization was unsuccessful, cannot execute Query'

  # Only send the options that are actually set.
  request_options = {}
  if self._db_path:
    request_options['db'] = self._db_path
  if rows >= 0:
    request_options['row_count_limit'] = rows
  request_body = {'query': datalog_query, 'options': request_options}

  # Send the query to the DataCommons query service
  try:
    response = self._service.query_table(body=request_body).execute()
  except Exception as e:
    msg = 'Failed to execute query:\n Query: {}\n Error: {}'.format(datalog_query, e)
    raise RuntimeError(msg)

  # Collect the cell values column by column, keeping the header order.
  columns = OrderedDict()
  for name in response.get('header', []):
    columns[name] = []
  for record in response.get('rows', []):
    for cell in record['cols']:
      columns[cell['key']].append(cell['value'])

  # Create the Pandas DataFrame
  return pd.DataFrame(columns).drop_duplicates()
- if new_col_name in pd_table: - raise ValueError( - '%s is already a column name in the data frame' % new_col_name) - - seed_col_type = seed_col[0] - assert seed_col_type != 'Text', 'Parent entity should not be Text' - - dcids = seed_col[1:] - if not outgoing: - # The type for properties pointing into entities in the seed column is - # stored in "self._inv_prop_type" - if arc_name not in self._inv_prop_type[seed_col_type]: - raise ValueError( - '%s does not have incoming property %s' % (seed_col_type, arc_name)) - new_col_type = self._inv_prop_type[seed_col_type][arc_name] - - # Create the query - query = ('SELECT ?{seed_col_name} ?{new_col_name},' - 'typeOf ?node {seed_col_type},' - 'dcid ?node {dcids},' - 'dcid ?node ?{seed_col_name},' - '{arc_name} ?{new_col_name} ?node').format( - arc_name=arc_name, - seed_col_name=seed_col_name, - seed_col_type=seed_col_type, - new_col_name=new_col_name, - dcids=' '.join(dcids)) - else: - # The type for properties pointing away from entities in the seed column - # is stored in "self._prop_type" - if arc_name not in self._prop_type[seed_col_type]: - raise ValueError( - '%s does not have outgoing property %s' % (seed_col_type, arc_name)) - new_col_type = self._prop_type[seed_col_type][arc_name] - - # Create the query - query = ('SELECT ?{seed_col_name} ?{new_col_name},' - 'typeOf ?node {seed_col_type},' - 'dcid ?node {dcids},' - 'dcid ?node ?{seed_col_name},' - '{arc_name} ?node ?{new_col_name}').format( - arc_name=arc_name, - seed_col_name=seed_col_name, - seed_col_type=seed_col_type, - new_col_name=new_col_name, - dcids=' '.join(dcids)) - - # Run the query and merge the results. - return self._query_and_merge( - pd_table, - query, - seed_col_name, - new_col_name, - new_col_type, - max_rows=max_rows) - - # ----------------------- OBSERVATION QUERY FUNCTIONS ----------------------- - - def get_instances(self, col_name, instance_type, max_rows=100): - """Get a list of instance dcids for a given type. 
class DCNode(object):
  """ A single node of the DataCommons knowledge graph, identified by DCID.

  The class is meant to support:

  - Listing the properties for which this node is a subject or an object.
  - Listing the values linked to this node through a given property.
  - Listing every triple that contains this node.

  The query methods are currently placeholders and return None.
  """

  def __init__(self,
               dcid,
               db_path=_BIG_QUERY_PATH,
               client_id=_SANDBOX_CLIENT_ID,
               client_secret=_SANDBOX_CLIENT_SECRET,
               api_root=_SANDBOX_API_ROOT):
    """ Creates the node wrapper together with its own query Client.

    Args:
      dcid: The DCID of the wrapped node.
      db_path: The path for the database to query.
      client_id: The API client id
      client_secret: The API client secret
      api_root: The API root url
    """
    client_args = dict(db_path=db_path,
                       client_id=client_id,
                       client_secret=client_secret,
                       api_root=api_root)
    self._client = Client(**client_args)
    self._dcid = dcid

  def get_properties(self, outgoing=True):
    """ Returns a list of properties associated with this node.

    Not implemented yet; currently returns None.

    Args:
      outgoing: whether or not the node is a subject or object.
    """
    return None

  def get_property_values(self, property, outgoing=True):
    """ Returns a list of values mapped to this node with the given property.

    Not implemented yet; currently returns None.

    Args:
      property: The property linking this node to the values.
      outgoing: whether or not the node is a subject or object.
    """
    return None

  def get_triples(self):
    """ Returns a list of triples where this node is either a subject or object.

    Not implemented yet; currently returns None.
    """
    return None
def __init__(self,
             file_name=None,
             datalog_query=None,
             labels=None,
             select=None,
             process=None,
             type_hint=None,
             rows=100,
             db_path=None,
             client_id=_SANDBOX_CLIENT_ID,
             client_secret=_SANDBOX_CLIENT_SECRET,
             api_root=_SANDBOX_API_ROOT):
  """ Initializes the DCFrame.

  A DCFrame can be initialized by providing the file name of a cached frame
  or a datalog query. If both are given the file name wins; if neither is
  given the frame starts out empty. When a datalog query is provided, the
  results of the query are stored in the frame with selected variables set as
  the column names. Additional fields such as labels, select, process, etc.
  can be provided to manipulate the results of the datalog query before it is
  wrapped by the DCFrame.

  The DCFrame requires typing information for the columns that it maintains.
  A column's type is taken from type_hint if present there, otherwise from
  the types reported by the query object. Query variables with no type from
  either source are typed as 'Text'.

  Args:
    file_name: File name of a cached DCTable.
    datalog_query: Query object representing datalog query [TODO(shanth): link].
      It must provide variables() and var_types(), and str() of it is the
      query string that gets sent.
    labels: A map from the query variables to column names in the DCFrame.
    select: A function that takes in a row and returns true if the row in the
      result should be added to the final DCFrame. Functions should index into
      columns using column names prior to relabeling.
    process: A function that takes in a Pandas DataFrame and returns the
      DataFrame to keep. Can be used for post processing the results such as
      converting columns to certain types. Functions should index into columns
      using names prior to relabeling.
    type_hint: A map from column names prior to relabeling to the type that
      the column contains.
    rows: The maximum number of rows requested from the query.
    db_path: The path for the database to query.
    client_id: The API client id
    client_secret: The API client secret
    api_root: The API root url

  Raises:
    RuntimeError: some problem with reading the cached frame or executing the
      query (hint in the string)
  """
  # NOTE(review): db_path defaults to None here while Client and DCNode
  # default to _BIG_QUERY_PATH -- confirm the difference is intended.
  self._client = Client(db_path=db_path,
                        client_id=client_id,
                        client_secret=client_secret,
                        api_root=api_root)
  self._dataframe = pd.DataFrame()
  self._col_types = {}

  # Read the dataframe from cache if a file name is provided or initialize
  # from a datalog query if the query is provided
  if file_name:
    try:
      response = self._client._service.read_dataframe(
          file_name=file_name
      ).execute()
    except Exception as e:  # pylint: disable=broad-except
      raise RuntimeError('Failed to read "{}": {}'.format(file_name, e))

    # Inflate the json string. The payload carries both the serialized
    # dataframe and the column type map.
    data = json.loads(response['data'])
    self._dataframe = pd.read_json(data['dataframe'])
    self._col_types = data['col_types']
  elif datalog_query:
    variables = datalog_query.variables()
    var_types = datalog_query.var_types()
    query_string = str(datalog_query)
    pd_frame = self._client.query(query_string, rows=rows)
    # Rows with any missing value are discarded before filtering.
    pd_frame = pd_frame.dropna()

    # If variable type is not provided in type_hint or from the query, infer
    # the type as text.
    for var in variables:
      if var not in var_types and (type_hint is None or var not in type_hint):
        var_types[var] = 'Text'

    # Processing is run in the order of row filtering via select, table
    # manipulation via process, and column renaming via labels. The order
    # matters: select and process both see the pre-relabel column names.
    if select:
      pd_frame = pd_frame[pd_frame.apply(select, axis=1)]
    if process:
      pd_frame = process(pd_frame)
    for col in pd_frame:
      # Set the column types and remap if the column labels are provided. Only
      # add types for columns that appear in the dataframe. This is critical
      # as "process" may delete columns from the query result.
      col_name = col
      if labels and col in labels:
        col_name = labels[col]
      if type_hint and col in type_hint:
        self._col_types[col_name] = type_hint[col]
      else:
        self._col_types[col_name] = var_types[col]
    if labels:
      pd_frame = pd_frame.rename(index=str, columns=labels)
    # Renumber the rows from zero after filtering.
    self._dataframe = pd_frame.reset_index(drop=True)
+ for var in variables: + if var not in var_types and (type_hint is None or var not in type_hint): + var_types[var] = 'Text' + + # Processing is run in the order of row filtering via select, table + # manipulation via process, and column renaming via labels. + if select: + pd_frame = pd_frame[pd_frame.apply(select, axis=1)] + if process: + pd_frame = process(pd_frame) + for col in pd_frame: + # Set the column types and remap if the column labels are provided. Only + # add types for columns that appear in the dataframe. This is critical + # as "process" may delete columns from the query result. + col_name = col + if labels and col in labels: + col_name = labels[col] + if type_hint and col in type_hint: + self._col_types[col_name] = type_hint[col] + else: + self._col_types[col_name] = var_types[col] + if labels: + pd_frame = pd_frame.rename(index=str, columns=labels) + self._dataframe = pd_frame.reset_index(drop=True) - Args: - pd_table: Pandas dataframe that contains entity information. - seed_col_name: The column that contains the population dcid. - new_col_name: New column name. - start_date: The start date of the observation (in 'YYY-mm-dd' form). - end_date: The end date of the observation (in 'YYY-mm-dd' form). - measured_property: observation measured property. - stats_type: Statistical type like "Median" - max_rows: The maximum number of rows returned by the query results. + def columns(self): + """ Returns the list of column names for this frame. Returns: - A pandas.DataFrame with an additional column added. + List of column names for this frame. + """ + return [col for col in self._dataframe] - Raises: - ValueError: when input argument is not valid. + def types(self): + """ Returns a map from column name to associated DataCommons type. + + Returns: + Map from column name to column type.
""" - assert self._inited, 'Initialization was unsuccessful, cannot execute query' - try: - seed_col = pd_table[seed_col_name] - except KeyError: - raise ValueError('%s is not a valid seed column name' % seed_col_name) + return self._col_types - if new_col_name in pd_table: - raise ValueError( - '%s is already a column name in the data frame' % new_col_name) - - seed_col_type = seed_col[0] - assert seed_col_type == 'Population' or seed_col_type == 'City', ( - 'Parent entity should be Population' or 'City') - - # Create the datalog query for the requested observations - dcids = seed_col[1:] - query = ('SELECT ?{seed_col_name} ?{new_col_name},' - 'typeOf ?pop {seed_col_type},' - 'typeOf ?o Observation,' - 'dcid ?pop {dcids},' - 'dcid ?pop ?{seed_col_name},' - 'observedNode ?o ?pop,' - 'startTime ?o {start_time},' - 'endTime ?o {end_time},' - 'measuredProperty ?o {measured_property},' - '{stats_type}Value ?o ?{new_col_name},').format( - seed_col_type=seed_col_type, - new_col_name=new_col_name, - seed_col_name=seed_col_name, - dcids=' '.join(dcids), - measured_property=measured_property, - stats_type=stats_type, - start_time=_date_epoch_micros(start_date), - end_time=_date_epoch_micros(end_date)) - # Run the query and merge the results. - return self._query_and_merge( - pd_table, - query, - seed_col_name, - new_col_name, - 'Observation', - max_rows=max_rows) - - # -------------------------- CACHING FUNCTIONS -------------------------- - - def read_dataframe(self, file_name): - """Read a previously saved pandas dataframe. - - User can only read previously saved data file with the same authentication - email. + def pandas(self, col_names=None, ignore_populations=False): + """ Returns a copy of the data in this view as a Pandas DataFrame. Args: - file_name: The saved file name. + col_names: An optional list specifying which columns to extract. + ignore_populations: Ignores all columns that have type + StatisticalPopulation. 
col_names takes precedence over this argument - Returns: - A pandas dataframe. - - Raises: - RuntimeError: when failed to read the dataframe. + Returns: A deep copy of the underlying Pandas DataFrame. """ - assert self._inited, 'Initialization was unsuccessful, cannot execute Query' - try: - response = self._service.read_dataframe(file_name=file_name).execute() - except Exception as e: # pylint: disable=broad-except - raise RuntimeError('Failed to read dataframe: {}'.format(e)) - return pd.read_json(json.loads(response['data']), dtype=False) + if not col_names: + col_names = list(self._dataframe) + if ignore_populations: + col_names = list(filter(lambda name: self._col_types[name] != 'StatisticalPopulation', col_names)) + return self._dataframe[col_names].copy() - def save_dataframe(self, pd_dataframe, file_name): - """Saves pandas dataframe for later retrieving. - - Each aunthentication email has its own scope for saved dataframe. Write - with same file_name overwrites previously saved dataframe. + def csv(self, col_names=None): + """ Returns the data in this view as a CSV string. Args: - pd_dataframe: A pandas.DataFrame. - file_name: The saved file name. + col_names: An optional list specifying which columns to extract. - Raises: - RuntimeError: when failed to save the dataframe. + Returns: + The DataFrame exported as a CSV string. 
""" - assert self._inited, 'Initialization was unsuccessful, cannot execute Query' - data = json.dumps(pd_dataframe.to_json()) - try: - self._service.save_dataframe(body={ - 'data': data, - 'object_name': file_name - }).execute() - except Exception as e: # pylint: disable=broad-except - raise RuntimeError('Failed to save dataframe: {}'.format(e)) - - # -------------------------- OTHER QUERY FUNCTIONS -------------------------- + if col_names: + return self._dataframe[col_names].to_csv(index=False) + return self._dataframe.to_csv(index=False) - def get_cities(self, state, new_col_name, max_rows=100): - """Get a list of city dcids in a given state. + def tsv(self, col_names=None): + """ Returns the data in this view as a TSV string. Args: - state: Name of the state name. - new_col_name: Column name for the returned city column. - max_rows: Max number of returend rows. + col_names: An optional list specifying which columns to extract. Returns: - A pandas.DataFrame with city dcids. + The DataFrame exported as a TSV string. """ - assert self._inited, 'Initialization was unsuccessful, cannot execute Query' - query = ('SELECT ?{new_col_name},' - 'typeOf ?node City,' - 'dcid ?node ?{new_col_name},' - 'containedInPlace ?node ?county,' - 'containedInPlace ?county ?state,' - 'name ?state "{state}"').format( - new_col_name=new_col_name, state=state) - type_row = pd.DataFrame(data=[{new_col_name: 'City'}]) + if col_names: + return self._dataframe[col_names].to_csv(index=False, sep='\t') + return self._dataframe.to_csv(index=False, sep='\t') - try: - dcid_column = self.query(query, max_rows) - except RuntimeError as e: - raise RuntimeError('Execute query\n%s\ngot an error:\n%s' % (query, e)) + def rename(self, labels): + """ Renames the columns of the DCFrame. + + Args: + labels: A map from current to new column names. 
+ """ + col_types = {} + for col in self._dataframe: + col_name = col + if col in labels: + col_name = labels[col] + col_types[col_name] = self._col_types[col] + self._col_types = col_types + self._dataframe = self._dataframe.rename(index=str, columns=labels) + + def add_column(self, col_name, col_type, col_vals): + """ Adds a column containing the given values of the given type. + + Args: + col_name: The name of the column + col_type: The type of the column + col_vals: The values in the given column + """ + self._col_types[col_name] = col_type + self._dataframe[col_name] = col_vals - return pd.concat([type_row, dcid_column], ignore_index=True) + def expand(self, property, seed_col_name, new_col_name, new_col_type=None, outgoing=True, rows=100): + """ Creates a new column containing values for the given property. - def get_states(self, country, new_col_name, max_rows=100): - """Get a list of state dcids. + For each entity in the given seed column, queries for entities related to + the seed entity via the given property. Results are stored in a new column + under the provided name. The seed column should contain only DCIDs. Args: - country: A string of the country states contained in. - new_col_name: Column name for the returned state column. - max_rows: max number of returend results. + property: The property to add to the table. + seed_col_name: The column name that contains dcids that the added + properties belong to. + new_col_name: The new column name. + new_col_type: The type contained by the new column. Provide this if the + type is not immediately inferrable. + outgoing: Set this flag if the seed property points away from the entities + denoted by the seed column. That is the seed column serve as subjects + in triples formed with the given property. + rows: The maximum number of rows returned by the query results. - Returns: - A pandas.DataFrame with state dcids. + Raises: + ValueError: when input argument is not valid. 
""" - assert self._inited, 'Initialization was unsuccessful, cannot execute Query' - query = ('SELECT ?{new_col_name},' - 'typeOf ?node State,' - 'dcid ?node ?{new_col_name},' - 'containedInPlace ?node ?country,' - 'name ?country "{country}"').format( - new_col_name=new_col_name, country=country) - type_row = pd.DataFrame(data=[{new_col_name: 'State'}]) - - try: - dcid_column = self.query(query, max_rows) - except RuntimeError as e: - raise RuntimeError('Execute query %s got an error:\n%s' % (query, e)) + if seed_col_name not in self._dataframe: + raise ValueError( + 'Expand error: {} is not a valid seed column.'.format(seed_col_name)) + if new_col_name in self._dataframe: + raise ValueError( + 'Expand error: {} is already a column.'.format(new_col_name)) - return pd.concat([type_row, dcid_column], ignore_index=True) + # Get the seed column information + seed_col = self._dataframe[seed_col_name] + seed_col_type = self._col_types[seed_col_name] + if seed_col_type == 'Text': + raise ValueError( + 'Expand error: {} must contain DCIDs'.format(seed_col_name)) + + # Determine the new column type + if new_col_type is None: + new_col_type = self._client.property_type(seed_col_type, property, outgoing=outgoing) + if new_col_type is None and outgoing: + new_col_type = 'Text' + elif new_col_type is None: + raise ValueError( + 'Expand error: {} does not have incoming property {}'.format(seed_col_type, property)) + + # Get the list of DCIDs to query for + dcids = list(seed_col) + if not dcids: + # All entries in the seed column are empty strings. The new column should + # contain no entries. + self._dataframe[new_col_name] = '' + self._col_types[new_col_name] = new_col_type + return + + # Construct the query + seed_col_var = '?' + seed_col_name.replace(' ', '_') + new_col_var = '?' 
+ new_col_name.replace(' ', '_') + labels = {seed_col_var: seed_col_name, new_col_var: new_col_name} + type_hint = {seed_col_var: seed_col_type, new_col_var: new_col_type} + + query = utils.DatalogQuery() + query.add_variable(seed_col_var, new_col_var) + query.add_constraint('?node', 'typeOf', seed_col_type) + query.add_constraint('?node', 'dcid', dcids) + query.add_constraint('?node', 'dcid', seed_col_var) + if outgoing: + query.add_constraint('?node', property, new_col_var) + else: + query.add_constraint(new_col_var, property, '?node') + # Create a new DCFrame and merge it in + new_frame = DCFrame(datalog_query=query, rows=rows, labels=labels, type_hint=type_hint) + self.merge(new_frame) - def get_places_in(self, place_type, container_dcid, col_name, max_rows=100): - """Get a list of places that are contained in a higher level geo places. + def merge(self, frame, how='left', default=''): + """ Joins the given frame into the current frame along shared column names. Args: - place_type: The place type, like "City". - container_dcid: The dcid of the container place. - col_name: Column name for the returned state column. - max_rows: max number of returend results. + frame: The DCFrame to merge in. + how: Optional argument specifying the join type to perform. Valid types + include 'left', 'right', 'inner', and 'outer' + default: The default placeholder for an empty cell produced by the join. - Returns: - A pandas.DataFrame with dcids of the contained place. + Raises: + ValueError: if the given arguments are not valid, e.g. a column shared by + the current and given DCFrame has a different type in each. """ - assert self._inited, 'Initialization was unsuccessful, cannot execute Query' - assert place_type in _PLACES, 'Input place types are not supported' - - # Get the type of the container place.
- type_query = 'SELECT ?type, dcid ?node {dcid}, subType ?node ?type'.format( - dcid=container_dcid) - query_result = self.query(type_query) - assert query_result['type'].count() == 1, ( - 'Type of the container dcid not found') - container_type = query_result['type'][0] - - # Sanity check the type information. - place_type_ind = _PLACES.index(place_type) - container_type_ind = _PLACES.index(container_type) - assert container_type_ind > place_type_ind, ( - 'Requested place type should be of a lower level than the container') - - # Do the actual query. - query = ('SELECT ?{col_name},' - 'typeOf ?node_{place_type} {place_type},' - 'dcid ?node_{place_type} ?{col_name},').format( - col_name=col_name, - place_type=place_type) - for i in range(place_type_ind, container_type_ind): - query += 'containedInPlace ?node_{child} ?node_{parent},'.format( - child=_PLACES[i], parent=_PLACES[i+1]) - query += 'dcid ?node_{container_type} "{container_dcid}"'.format( - container_type=container_type, container_dcid=container_dcid) - try: - dcid_column = self.query(query, max_rows) - except RuntimeError as e: - raise RuntimeError('Execute query %s got an error:\n%s' % (query, e)) - - type_row = pd.DataFrame(data=[{col_name: place_type}]) - return pd.concat([type_row, dcid_column], ignore_index=True) + merge_on = set(self.columns()) & set(frame.columns()) + merge_on = list(merge_on) + + # If the current dataframe is empty, select the given dataframe. If the + # tables have no columns in common, perform a cross join. Otherwise join on + # common columns. 
+ if self._dataframe.empty: + self._col_types = {} + self._dataframe = frame._dataframe + elif len(merge_on) == 0: + # Construct a unique dummy column name + cross_on = ''.join(self.columns() + frame.columns()) + + # Perform the cross join + curr_frame = self._dataframe.assign(**{cross_on: 1}) + new_frame = frame._dataframe.assign(**{cross_on: 1}) + merged = curr_frame.merge(new_frame) + self._dataframe = merged.drop(cross_on, 1) + else: + # Verify that columns being merged have the same type + for col in merge_on: + if self._col_types[col] != frame._col_types[col]: + raise ValueError( + 'Merge error: columns type mismatch for {}.\n Current: {}\n Given: {}'.format(col, self._col_types[col], frame._col_types[col])) + # Merge dataframe, column types, and property maps + self._dataframe = self._dataframe.merge(frame._dataframe, how=how, left_on=merge_on, right_on=merge_on) + self._dataframe = self._dataframe.fillna(default) - # ------------------------ INTERNAL HELPER FUNCTIONS ------------------------ + # Merge the types + self._col_types.update(frame._col_types) - def _query_and_merge(self, - pd_table, - query, - seed_col_name, - new_col_name, - new_col_type, - max_rows=100): - """A utility function that executes the given query and adds a new column. + def clear(self): + """ Clears all the data stored in this extension. """ + self._col_types = {} + self._dataframe = pd.DataFrame() - It sends an request to the API server to execute the given query and joins - a new column with the result and type data along with the values in the seed - column. + def save(self, file_name): + """ Saves the current DCFrame to the DataCommons cache with given file name. Args: - pd_table: A Pandas dataframe where the new data will be added. - query: The query to be executed. This query must output a column with the - same name as "seed_col_name" - seed_col_name: The name of the seed column (i.e. the column to join the - new data against). - new_col_name: The name of the new column. 
- new_col_type: The type of the entities contained in the new column. - max_rows: The maximum number of rows returned by the query results. + file_name: The name used to store the current DCFrame. Returns: - A pandas.DataFrame with an additional column containing the result of the - query joined with elements in the seed column. - """ - try: - query_result = self.query(query, max_rows=max_rows) - except RuntimeError as e: - raise RuntimeError('Execute query \n%s\ngot an error:\n%s' % (query, e)) + The file name that the save request returned in its response. - new_data = pd.merge( - pd_table[1:], query_result, how='left', on=seed_col_name) - new_data[new_col_name] = new_data[new_col_name].fillna('') - new_type_row = pd_table.loc[0].to_frame().T - new_type_row[new_col_name] = new_col_type + Raises: + RuntimeError: when failed to save the dataframe. + """ + assert self._client._inited, 'Initialization was unsuccessful, cannot execute Query' - return pd.concat([new_type_row, new_data], ignore_index=True) + # Saves the DCFrame to cache + data = json.dumps({ + 'dataframe': self._dataframe.to_json(), + 'col_types': self._col_types + }) + try: + response = self._client._service.save_dataframe(body={ + 'data': data, + 'file_name': file_name + }).execute() + except Exception as e: # pylint: disable=broad-except + raise RuntimeError('Failed to save dataframe: {}'.format(e)) + return response['file_name'] diff --git a/datacommons/examples/BUILD.bazel b/datacommons/examples/BUILD.bazel index c839b120..c99a3c2b 100644 --- a/datacommons/examples/BUILD.bazel +++ b/datacommons/examples/BUILD.bazel @@ -1,13 +1,13 @@ load("@requirements//:requirements.bzl", "requirement") -py_binary( - name = "add_property_query", - srcs = ["add_property_query.py"], - deps = [ - "//datacommons:datacommons", - requirement("pandas"), - ] -) +#py_binary( +# name = "add_property_query", +# srcs = ["add_property_query.py"], +# deps = [ +# "//datacommons:datacommons", +# requirement("pandas"), +# ] +#) py_binary( name = "get_instances", @@
-19,8 +19,8 @@ py_binary( ) py_binary( - name = "population_analysis", - srcs = ["population_analysis.py"], + name = "analysis_populations", + srcs = ["analysis_populations.py"], deps = [ "//datacommons:datacommons", requirement("pandas"), @@ -28,8 +28,8 @@ py_binary( ) py_binary( - name = "simple_query", - srcs = ["simple_query.py"], + name = "query_basic", + srcs = ["query_basic.py"], deps = [ "//datacommons:datacommons", requirement("pandas"), @@ -37,8 +37,8 @@ py_binary( ) py_binary( - name = "weather_analysis", - srcs = ["weather_analysis.py"], + name = "analysis_weather", + srcs = ["analysis_weather.py"], deps = [ "//datacommons:datacommons", requirement("pandas"), diff --git a/datacommons/examples/add_property_query.py b/datacommons/examples/add_property_query.py deleted file mode 100644 index d4f83a3b..00000000 --- a/datacommons/examples/add_property_query.py +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Example query demonstrating expand API. - -Adds properties in the incoming and outgoing direction by building a table of -all counties contained in the United States. -""" - -import datacommons -import pandas as pd - - -def main(): - dc = datacommons.Client() - - # Start with all states in the United States and add the state names. This - # is an outgoing property of State. 
- pd_state = dc.get_places_in( - place_type='State', - container_dcid='dc/2sffw13', # United States - col_name='state') - pd_state = dc.expand(pd_state, 'name', 'state', 'state_name', outgoing=True) - - # Add information for counties contained in states in the 'state' column. - # Getting the county is an incoming property of State. Note that there are - # roughly 3100 counties in the United States - pd_state = dc.expand( - pd_state, - 'containedInPlace', - 'state', - 'county', - outgoing=False, - max_rows=50) - pd_state = dc.expand( - pd_state, 'name', 'county', 'county_name', outgoing=True, max_rows=50) - - # Print out the final data frame - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print pd_state - - - pd_city = dc.get_places_in( - place_type='City', - container_dcid='dc/b72vdv', # California - col_name='city') - pd_city = dc.expand(pd_city, 'name', 'city', 'city_name', outgoing=True) - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print pd_city - -if __name__ == '__main__': - main() diff --git a/datacommons/examples/analysis_populations.py b/datacommons/examples/analysis_populations.py new file mode 100644 index 00000000..8af818ed --- /dev/null +++ b/datacommons/examples/analysis_populations.py @@ -0,0 +1,98 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example analysis with DataCommons Python API. 
+ +Note: to use this API code in a Colab or another IPython notebook environment, +add the following code: +pip install --upgrade numpy +pip install --upgrade pandas +pip install --upgrade git+https://github.com/ACscooter/datacommons.git@feature/api-version-2 +""" + +import datacommons +from datacommons.utils import DatalogQuery +from datacommons.populations import PopulationsExtension + +import pandas as pd + +# Print options +pd.set_option('display.max_colwidth', -1) +pd.set_option('display.width', 1000) +pd.set_option('display.max_columns', 50) +pd.set_option('display.max_rows', 20) + +# Helper function for formatting table printing +def print_pandas(example_num, df): + print('-'*80) + print('EXAMPLE {}'.format(example_num)) + print('-'*80 + '\n') + print(df) + print('\n') + +def main(): + frame_1 = datacommons.DCFrame() # establish generic df + frame_1 = PopulationsExtension(frame_1) # add population features to df + + # Start by initializing a column of three US states: California, Kentucky, and + # Maryland. + frame_1.add_column('state_dcid', 'State', ['geoId/06', 'geoId/21', 'geoId/24']) + print_pandas(1, frame_1.pandas()) + + # Name is an outgoing property of the State. We can call expand to populate a + # column 'state_name' with names of states corresponding to dcids in the + # 'state_dcid' column.
+ frame_1.expand('name', 'state_dcid', 'state_name') + + # Get populations for state + frame_1.get_populations( + seed_col_name='state_dcid', + new_col_name='state_population', + population_type='Person', + rows=100) + print_pandas(2, frame_1.pandas()) + + frame_1.get_populations( + seed_col_name='state_dcid', + new_col_name='state_male_population', + population_type='Person', + rows=100, + gender='Male') + print_pandas(3, frame_1.pandas()) + + frame_1.get_populations( + seed_col_name='state_dcid', + new_col_name='state_female_population', + population_type='Person', + rows=100, + gender='Female') + print_pandas(3, frame_1.pandas()) + + # Get observations on state populations + frame_1.get_observations( + seed_col_name='state_population', + new_col_name='state_person_2016_count', + observation_date='2016', + measured_property='count') + print_pandas(4, frame_1.pandas()) + + # To ignore the population columns... + print_pandas(5, frame_1.pandas(ignore_populations=True)) + + # Print the max population count + print('Max population count...') + print(frame_1.pandas()['state_person_2016_count'].max()) + + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/weather_analysis.py b/datacommons/examples/analysis_weather.py similarity index 100% rename from datacommons/examples/weather_analysis.py rename to datacommons/examples/analysis_weather.py diff --git a/datacommons/examples/bio_basic.py b/datacommons/examples/bio_basic.py new file mode 100644 index 00000000..7bbd16b3 --- /dev/null +++ b/datacommons/examples/bio_basic.py @@ -0,0 +1,141 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Basic demo showcasing how to use the Bio extension. + +Note: to use this API code in a Colab or another IPython notebook environment, +add the following code: +pip install --upgrade numpy +pip install --upgrade pandas +pip install --upgrade git+https://github.com/ACscooter/datacommons.git@api-version-2 +""" + +from datacommons.utils import DatalogQuery +from datacommons.bio import BioExtension + +import pandas as pd + +import datacommons + +# Print options +pd.set_option('display.max_colwidth', -1) +pd.set_option('display.width', 1000) +pd.set_option('display.max_columns', 50) +pd.set_option('display.max_rows', 20) + +# Helper function for formatting table printing +def print_pandas(example_num, df): + print('-'*80) + print('EXAMPLE {}'.format(example_num)) + print('-'*80 + '\n') + print(df) + print('\n') + +def main(): + # Example 1: query for experiments that have a specific assay specified.
+ frame_1 = datacommons.DCFrame() + frame_1 = BioExtension(frame_1) + frame_1.get_experiments('Experiment', + assay_category=['Transcription'], + bio_class=['cell line'], + bio_term=['K562'], + rows=1000) + frame_1.expand('description', 'Experiment', 'Description') + print_pandas(1, frame_1.pandas()) + + # Example 2: query for experiments published by a specific lab + frame_2 = datacommons.DCFrame() + frame_2 = BioExtension(frame_2) + frame_2.get_experiments('Experiment', lab_name=['yijun-ruan']) + frame_2.expand('description', 'Experiment', 'Description') + print_pandas(2, frame_2.pandas()) + + # Example 3: query for experiments by assembly + frame_3 = datacommons.DCFrame() + frame_3 = BioExtension(frame_3) + frame_3.get_experiments('Experiment', assembly=['hg19'], rows=1000) + frame_3.expand('description', 'Experiment', 'Description') + print_pandas(3, frame_3.pandas()) + + # Example 4: query for experiments by multiple biosample summaries + frame_4 = datacommons.DCFrame() + frame_4 = BioExtension(frame_4) + frame_4.get_experiments('Experiment', + bio_class=['primary cell', 'cell line'], + bio_term=['endothelial cell of umbilical vein', 'K562'], + rows=1000) + frame_4.expand('description', 'Experiment', 'Description') + print_pandas(4, frame_4.pandas()) + + # Example 5: query for bed files associated with a given column of experiments + frame_5 = datacommons.DCFrame() + frame_5 = BioExtension(frame_5) + frame_5.get_experiments('Experiment', + assay_category=['Transcription'], + bio_class=['cell line'], + bio_term=['K562']) + frame_5.get_bed_files('Experiment', 'BedFile IDs') + frame_5.expand('description', 'Experiment', 'Description') + frame_5.expand('name', 'BedFile IDs', 'BedFile Names') + print_pandas(5, frame_5.pandas()) + + # Example 6: get all BedLines associated with a BedFile + frame_6 = datacommons.DCFrame() + frame_6 = BioExtension(frame_6) + frame_6.get_experiments('Experiment', + assay_category=['Transcription'], + bio_class=['cell line'], + 
bio_term=['K562'], + rows=1) + frame_6.get_bed_files('Experiment', 'BedFile IDs') + frame_6.get_bed_lines('BedFile IDs') + print_pandas(6, frame_6.pandas()) + + # Example 7: getting other fields in extended Bed files. One thing to notice + # is that we can still filter by the chromosome even if we're not populating + # a column with its values. + prop_info = [ + ('StartPos', 'chromosomeStart', 'Integer'), + ('EndPos', 'chromosomeEnd', 'Integer'), + ('RGBValue', 'itemRGB', 'Text'), + ('ThickStart', 'thickStart', 'Integer'), + ('ThickEnd', 'thickEnd', 'Integer'), + ] + frame_7 = datacommons.DCFrame() + frame_7 = BioExtension(frame_7) + frame_7.get_experiments('Experiment', lab_name=['yijun-ruan'], rows=40) + frame_7.get_bed_files('Experiment', 'BedFile IDs', rows=40) + frame_7.expand('name', 'BedFile IDs', 'BedFileName', rows=40) + frame_7.get_bed_lines('BedFile IDs', + prop_info=prop_info, + chromosome=['chr7'], + rows=1000) + print_pandas(7, frame_7.pandas()) + + # Example 8: querying bedlines from multiple files and filtering by chromosome + # and start-end range. + # NOTE: This query takes a bit longer than others due to its size. + frame_8 = datacommons.DCFrame() + frame_8 = BioExtension(frame_8) + frame_8.get_experiments('Experiment', lab_name=['yijun-ruan'], rows=40) + frame_8.get_bed_files('Experiment', 'BedFile IDs', rows=40) + frame_8.expand('name', 'BedFile IDs', 'BedFileName', rows=40) + frame_8.get_bed_lines('BedFile IDs', + chromosome=['chr7'], + start_pos=[5000000], + end_pos=[10000000], + rows=500000) + print_pandas(8, frame_8.pandas()) + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/dcframe_basic.py b/datacommons/examples/dcframe_basic.py new file mode 100644 index 00000000..e14c5c8f --- /dev/null +++ b/datacommons/examples/dcframe_basic.py @@ -0,0 +1,53 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Basic demo for the DCFrame. + +This demo showcases basic features of the DCFrame class. +- Initializing a frame from query +- Relabeling the columns +- Getting column names and types +- Getting a Pandas DataFrame view +- Getting csv and tsv strings of the frame +""" + +from datacommons.utils import DatalogQuery + +import datacommons + + +def main(): + # Create the query + query = DatalogQuery() + query.add_variable('?id', '?lat', '?lon') + query.add_constraint('?node', 'typeOf', 'City') + query.add_constraint('?node', 'name', '"San Luis Obispo"') + query.add_constraint('?node', 'dcid', '?id') + query.add_constraint('?node', 'latitude', '?lat') + query.add_constraint('?node', 'longitude', '?lon') + + # Create the frame + labels = {'?id': 'city', '?lat': 'latitude', '?lon': 'longitude'} + frame = datacommons.DCFrame(datalog_query=query, labels=labels) + + print('> Columns\t{}'.format(frame.columns())) + print('> Col types\t{}'.format(frame.types())) + print('> Pandas frame\n') + print(frame.pandas()) + print('\n> CSV string\n') + print(frame.csv()) + print('\n> TSV string\n') + print(frame.tsv()) + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/dcframe_cache.py b/datacommons/examples/dcframe_cache.py new file mode 100644 index 00000000..cbdb3d8e --- /dev/null +++ b/datacommons/examples/dcframe_cache.py @@ -0,0 +1,52 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Demo showcasing DataCommons caching features. + +This demo shows how to use the DataCommons caching service to store and load +DCFrames that have been created. They are associated with your user +authenticated email. +""" + +from datacommons.utils import DatalogQuery + +import datacommons + + +def main(): + # Create the query + query = DatalogQuery() + query.add_variable('?id', '?lat', '?lon') + query.add_constraint('?node', 'typeOf', 'City') + query.add_constraint('?node', 'name', '"San Luis Obispo"') + query.add_constraint('?node', 'dcid', '?id') + query.add_constraint('?node', 'latitude', '?lat') + query.add_constraint('?node', 'longitude', '?lon') + + # Create the frame + labels = {'?id': 'city', '?lat': 'latitude', '?lon': 'longitude'} + frame = datacommons.DCFrame(datalog_query=query, labels=labels) + + # Save the dataframe + saved_name = frame.save('test_df') + print('> Saving dataframe to {}'.format(saved_name)) + print(frame.pandas()) + + # Read a new frame from the saved version + saved_frame = datacommons.DCFrame(file_name=saved_name) + print('\n> Reading from {}'.format(saved_name)) + print(frame.pandas()) + + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/dcframe_expand.py b/datacommons/examples/dcframe_expand.py new file mode 100644 index 00000000..4c352d8e --- /dev/null +++ b/datacommons/examples/dcframe_expand.py @@ -0,0 +1,72 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Demo showcasing the DataCommons expand feature. + +This demo shows how to use the DataCommons expand feature to incrementally +build your DCFrame. The expand feature is very useful for adding data from +other properties in the DataCommons graph. +""" + +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Example query demonstrating expand API. + +Adds properties in the incoming and outgoing direction by building a table of +counties contained in the United States. +""" + + +from datacommons.utils import DatalogQuery + +import datacommons + + +def main(): + frame = datacommons.DCFrame() + + # Start by initializing a column of three US states: California, Kentucky, and + # Maryland. + frame.add_column('state_dcid', 'State', ['geoId/06', 'geoId/21', 'geoId/24']) + + # Name is an outgoing property of the State. We can call expand to populate a + # column 'state_name' with names of states corresponding to dcids in the + # 'state_dcid' column.
+ frame.expand('name', 'state_dcid', 'state_name') + print(frame.pandas()) + + # We can also use expand to traverse incoming properties. To get all Counties + # contained in States, we construct a column of county dcids using the + # containedInPlace property pointing into State. This requires a type hint + # as multiple types can be containedInPlace of a State. + frame.expand('containedInPlace', 'state_dcid', 'county_dcid', new_col_type='County', outgoing=False) + print(frame.pandas()) + + # Finally, we populate a column of County names. + frame.expand('name', 'county_dcid', 'county_name') + print(frame.pandas()) + + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/dcframe_manipulation.py b/datacommons/examples/dcframe_manipulation.py new file mode 100644 index 00000000..f32d619e --- /dev/null +++ b/datacommons/examples/dcframe_manipulation.py @@ -0,0 +1,54 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Demo showcasing the DCFrame manipulation workflow + +When constructing a DCFrame from a query, the frame can be manipulated using +the 'select' and 'process' arguments. +""" + +from datacommons.utils import DatalogQuery + +import datacommons + +def main(): + # We begin with a query that gets States in the datacommons graph.
+ query = DatalogQuery() + query.add_variable('?stateName') + query.add_constraint('?state', 'typeOf', 'State') + query.add_constraint('?state', 'name', '?stateName') + frame = datacommons.DCFrame(datalog_query=query) + print('> Querying for states') + print(frame.pandas()) + + # If we want to perform the same query but select states that start with the + # letter 'A', then we can provide a selector function to the frame constructor + # that returns True iff the row is to be selected. + select = lambda row: row['stateName'].startswith('A') + frame_select = datacommons.DCFrame(datalog_query=query, select=select) + print('\n> States that begin with "A"') + print(frame_select.pandas()) + + # Perhaps we also want to capitalize all State names returned in the query. + # This can be done by specifying a post processing function that takes in the + # resulting table as an argument. + def process(frame): + frame.columns = map(lambda row: str(row).upper(), frame.columns) + return frame + frame_process = datacommons.DCFrame(datalog_query=query, select=select, process=process) + print('\n> States that begin with "A" capitalized') + print(frame_process.pandas()) + + +if __name__ == '__main__': + main() diff --git a/datacommons/examples/population_analysis.py b/datacommons/examples/population_analysis.py deleted file mode 100644 index 3fc843d8..00000000 --- a/datacommons/examples/population_analysis.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2017 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and -# limitations under the License. -"""Example analysis with DataCommons Python API. - -""" - -import datacommons -import pandas as pd - - -def main(): - dc = datacommons.Client() - - # Build a table with a single US state - state_table = dc.get_states('United States', 'state', max_rows=1) - - # Add the state name and the 5 counties contained in that state - state_table = dc.expand( - state_table, 'name', 'state', 'state_name', outgoing=True) - state_table = dc.expand( - state_table, - 'containedInPlace', - 'state', - 'county', - outgoing=False, - max_rows=3) - state_table = dc.expand( - state_table, 'name', 'county', 'county_name', outgoing=True) - - state_table = dc.get_populations( - state_table, - seed_col_name='county', - new_col_name='county_population', - population_type='Person', - max_rows=100) - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print state_table - - state_table = dc.get_populations( - state_table, - seed_col_name='county', - new_col_name='county_18_24_years_population', - population_type='Person', - max_rows=100, - age='USC/18To24Years') - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print state_table - - state_table = dc.get_populations( - state_table, - seed_col_name='county', - new_col_name='county_male_population', - population_type='Person', - max_rows=100, - gender='Male') - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print state_table - - state_table = dc.get_observations( - state_table, - seed_col_name='county_population', - new_col_name='county_person_count', - start_date='2012-01-01', - end_date='2016-01-01', - measured_property='count', - stats_type='count') - - with pd.option_context('display.width', 400, 'display.max_rows', 100): - print state_table - - -if __name__ == '__main__': - main() diff --git a/datacommons/examples/simple_query.py b/datacommons/examples/query_basic.py 
similarity index 87% rename from datacommons/examples/simple_query.py rename to datacommons/examples/query_basic.py index 0bf774d4..60194312 100644 --- a/datacommons/examples/simple_query.py +++ b/datacommons/examples/query_basic.py @@ -27,14 +27,14 @@ def main(): dc = datacommons.Client() # Get lat/long of a city. - query = (""" + query = (''' SELECT ?id ?lat ?long, typeOf ?o City, - name ?o 'San Luis Obispo', + name ?o "San Luis Obispo", dcid ?o ?id, latitude ?o ?lat, longitude ?o ?long - """) + ''') print('Issuing query "{}"'.format(query)) try: df = dc.query(query) @@ -45,10 +45,6 @@ def main(): with pd.option_context('display.width', 400, 'display.max_rows', 100): print(df) - dc.save_dataframe(df, 'test_df') - saved_df = dc.read_dataframe('test_df') - assert df.equals(saved_df) - if __name__ == '__main__': main() diff --git a/datacommons/examples/weather_basic.py b/datacommons/examples/weather_basic.py new file mode 100644 index 00000000..f2a17db7 --- /dev/null +++ b/datacommons/examples/weather_basic.py @@ -0,0 +1,75 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Basic demo showing how to use the weather extension +""" + +from datacommons.utils import DatalogQuery +from datacommons.weather import WeatherExtension +from datacommons.utils import MeasuredValue + +import pandas as pd + +import datacommons + +# Print options +pd.set_option('display.max_colwidth', -1) +pd.set_option('display.width', 1000) +pd.set_option('display.max_columns', 50) +pd.set_option('display.max_rows', 20) + +def display_table(num, label, df): + print('-'*80) + print('EXAMPLE {}: {}'.format(num, label)) + print('-'*80 + '\n') + print(df) + print('\n') + +def main(): + # Create a list of places to get weather. + places = [ + 'geoId/4261000', # Pittsburgh, PA + 'geoId/0649670', # Mountain View, CA + 'geoId/4805000', # Austin, TX + 'geoId/0606000', # Berkeley, CA + ] + + # Example 1: Getting the temperature + frame_1 = datacommons.DCFrame() + frame_1 = WeatherExtension(frame_1) + frame_1.add_column('CityID', 'City', places) + frame_1.expand('name', 'CityID', 'CityName', new_col_type='Text') + frame_1.get_temperature('CityID', 'MeanTemp', MeasuredValue.MEAN, date='2019-05-09') + frame_1.get_temperature('CityID', 'MinTemp', MeasuredValue.MIN, date='2019-05-09') + display_table(1, 'Temperature', frame_1.pandas()) + + # Example 2: Getting the rainfall + frame_2 = datacommons.DCFrame() + frame_2 = WeatherExtension(frame_2) + frame_2.add_column('CityID', 'City', places) + frame_2.expand('name', 'CityID', 'CityName', new_col_type='Text') + frame_2.get_rainfall('CityID', 'MeanRain', MeasuredValue.MEAN, date='2019-05-09') + frame_2.get_rainfall('CityID', 'MinRain', MeasuredValue.MIN, date='2019-05-09') + display_table(2, 'Rainfall', frame_2.pandas()) + + # Example 3: Getting the visibility + frame_3 = datacommons.DCFrame() + frame_3 = WeatherExtension(frame_3) + frame_3.add_column('CityID', 'City', places) + frame_3.expand('name', 'CityID', 'CityName', new_col_type='Text') + frame_3.get_visibility('CityID', 'MeanVisibility', MeasuredValue.MEAN, 
date='2019-05-09') + display_table(3, 'Visibility', frame_3.pandas()) + + +if __name__ == '__main__': + main() diff --git a/datacommons/places.py b/datacommons/places.py new file mode 100644 index 00000000..1aad153f --- /dev/null +++ b/datacommons/places.py @@ -0,0 +1,97 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DataCommons Places data API Extension. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from types import MethodType +from .datacommons import DCFrame +from . import utils + +_PLACES = { + 'City': 'County', + 'SchoolDistrict': 'County', + 'CensusTract': 'County', + 'County': 'State', + 'State': 'Country', + 'Country': 'Continent' +} + +def PlacesExtension(frame): + """ The DataCommons places API extension. """ + frame.get_places_in = MethodType(get_places_in, frame) + return frame + +def get_places_in(self, seed_col_name, new_col_name, new_col_type, rows=100): + """ Adds a new column to the frame with places contained in seed column entities. + + Args: + seed_col_name: The column name containing DCIDs to get contained entities. + new_col_name: The column name for where the results are stored. + new_col_type: The type of place to query for. + rows: max number of returned results. + Returns: + A pandas.DataFrame with dcids of the contained place.
+ """ + if seed_col_name not in self._dataframe: + raise ValueError('{} is not a valid seed column.'.format(seed_col_name)) + if new_col_name in self._dataframe: + raise ValueError('{} is already a column name.'.format(new_col_name)) + if new_col_type not in _PLACES: + raise ValueError('Place type {} is not supported.'.format(new_col_type)) + + # Get the variable names + seed_col_var = '?' + seed_col_name.replace(' ', '_') + new_col_var = '?' + new_col_name.replace(' ', '_') + labels = {seed_col_var: seed_col_name, new_col_var: new_col_name} + + # Get the type of the container place + seed_col_type = self._col_types[seed_col_name] + type_hint = {seed_col_var: seed_col_type, new_col_var: new_col_type} + + # Get allowed DCIDs + dcids = list(self._dataframe[seed_col_name]) + if not dcids: + # All entries in the seed column are empty strings. The new column should + # contain no entries. + self._dataframe[new_col_name] = '' + self._col_types[new_col_name] = new_col_type + return + + # Construct the query + query = utils.DatalogQuery() + query.add_variable(seed_col_var, new_col_var) + query.add_constraint('?node{}'.format(new_col_type), 'typeOf', new_col_type) + query.add_constraint('?node{}'.format(new_col_type), 'dcid', new_col_var) + + # Construct chain of parent types + curr_type = new_col_type + parent_type = _PLACES[curr_type] + while curr_type != seed_col_type: + query.add_constraint('?node{}'.format(curr_type), 'containedInPlace', '?node{}'.format(parent_type)) + curr_type = parent_type + if curr_type not in _PLACES: + raise ValueError('{} is not contained in {}.'.format(new_col_type, seed_col_type)) + parent_type = _PLACES[curr_type] + + query.add_constraint('?node{}'.format(seed_col_type), 'dcid', dcids) + query.add_constraint('?node{}'.format(seed_col_type), 'dcid', seed_col_var) + + # Perform the query and merge the results + new_frame = DCFrame(datalog_query=query, labels=labels, type_hint=type_hint, rows=rows) + self.merge(new_frame) diff --git 
a/datacommons/populations.py b/datacommons/populations.py new file mode 100644 index 00000000..b9183320 --- /dev/null +++ b/datacommons/populations.py @@ -0,0 +1,191 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Data Commons Populations API Extension. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from types import MethodType +from .datacommons import DCFrame +from . import utils + +def PopulationsExtension(frame): + """ The DataCommons populations API extension. + Allows users to do frame.function_defined_in_this_extension + as if these extension functions were built in frame funcs. + """ + frame.get_populations = MethodType(get_populations, frame) + frame.get_observations = MethodType(get_observations, frame) + return frame + +def get_populations(self, + seed_col_name, + new_col_name, + population_type, + rows=100, + location_property='location', + **kwargs): + """Create a new column with population dcid, in place. + The current pandas dataframe should include a column containing entity IDs + for geo entities. This function populates a new column with + population dcids corresponding to the geo entities. + + Args: + seed_col_name: The column name that contains entity (ids) that the added + populations belong to. + new_col_name: New column name. + population_type: Population type like "Person". 
+ rows: The maximum number of rows returned by the query results. + **kwargs: keyword properties to define the population. + + Raises: + ValueError: when input argument is not valid. + """ + if seed_col_name not in self._dataframe: + raise ValueError('{} is not a valid seed column.'.format(seed_col_name)) + if new_col_name in self._dataframe: + raise ValueError('{} is already a column name.'.format(new_col_name)) + + # Get the variable names + seed_col_var = '?' + seed_col_name.replace(' ', '_') + new_col_var = '?' + new_col_name.replace(' ', '_') + labels = {seed_col_var: seed_col_name, new_col_var: new_col_name} + + # Get the type of the container place + seed_col_type = self._col_types[seed_col_name] + new_col_type = 'StatisticalPopulation' + type_hint = {seed_col_var: seed_col_type, new_col_var: new_col_type} + + # Get allowed DCIDs + dcids = list(self._dataframe[seed_col_name]) + if not dcids: + # All entries in the seed column are empty strings. The new column should + # contain no entries. + self._dataframe[new_col_name] = '' + self._col_types[new_col_name] = new_col_type + return + + # Construct the query + query = utils.DatalogQuery() + # Specify which variables to SELECT + query.add_variable(seed_col_var, new_col_var) + # Add constraints to the SELECT SQL query + query.add_constraint('?node', 'typeOf', seed_col_type) + query.add_constraint('?pop', 'typeOf', 'StatisticalPopulation') + query.add_constraint('?node', 'dcid', dcids) + query.add_constraint('?node', 'dcid', seed_col_var) + query.add_constraint('?pop', location_property, '?node') + query.add_constraint('?pop', 'dcid', new_col_var) + query.add_constraint('?pop', 'populationType', population_type) + + pv_pairs = sorted(kwargs.items()) + idx = 0 + for idx, pv in enumerate(pv_pairs, 1): + query.add_constraint('?pop', 'p{}'.format(idx), pv[0]) # ?
need str(pv[0]) + query.add_constraint('?pop', 'v{}'.format(idx), pv[1]) # ditto + query.add_constraint('?pop', 'numConstraints', idx) + + # Perform the query and merge the results + new_frame = DCFrame(datalog_query=query, labels=labels, type_hint=type_hint, rows=rows) + self.merge(new_frame) + +def get_observations(self, + seed_col_name, + new_col_name, + observation_date, + measured_property, + stats_type=None, + clean_data=False, + rows=100): + """Create a new column with values for an observation of the given property. + The current pandas dataframe should include a column containing population + dcids. This function populates a new column with observations of the + populations' measured property. A column containing geo ids of type City + can be used instead of population dcids. + + Args: + seed_col_name: The column that contains the population dcid or city geo id. + new_col_name: New column name. + observation_date: The date of the observation (in 'YYYY-mm-dd' form). + measured_property: observation measured property. + stats_type: Statistical type like "Median" + clean_data: A flag to convert to numerical types and filter out any NaNs. + rows: The maximum number of rows returned by the query results. + + Raises: + ValueError: when input argument is not valid. + """ + if seed_col_name not in self._dataframe: + raise ValueError('{} is not a valid seed column.'.format(seed_col_name)) + if new_col_name in self._dataframe: + raise ValueError('{} is already a column name.'.format(new_col_name)) + + # Get the variable names + seed_col_var = '?' + seed_col_name.replace(' ', '_') + new_col_var = '?'
+ new_col_name.replace(' ', '_') + labels = {seed_col_var: seed_col_name, new_col_var: new_col_name} + + # Get the type of the container place + seed_col_type = self._col_types[seed_col_name] + new_col_type = 'Observation' + type_hint = {seed_col_var: seed_col_type, new_col_var: new_col_type} + + # Make sure the seed column can have observations + assert seed_col_type == 'StatisticalPopulation' or seed_col_type == 'City', ( + 'Parent entity should be StatisticalPopulation' or 'City') + + # Get allowed DCIDs + dcids = list(self._dataframe[seed_col_name]) + if not dcids: + self._dataframe[new_col_name] = '' + self._col_types[new_col_name] = new_col_type + return + + if stats_type is None: + stats_type = 'measured' + + # Construct the query + query = utils.DatalogQuery() + # Specify which variables to SELECT + query.add_variable(seed_col_var, new_col_var) + # Add constraints to the SELECT SQL query + query.add_constraint('?pop', 'typeOf', seed_col_type) + query.add_constraint('?o', 'typeOf', 'Observation') + query.add_constraint('?pop', 'dcid', dcids) + query.add_constraint('?pop', 'dcid', seed_col_var) + query.add_constraint('?o', 'observedNode', '?pop') + query.add_constraint('?o', 'observationDate', '\"{}\"'.format(observation_date)) + query.add_constraint('?o', 'measuredProperty', measured_property) + query.add_constraint('?o', '{}Value'.format(stats_type), new_col_var) + measurement_method = None + if measured_property == 'prevalence': + measurement_method = 'CDC_CrudePrevalence' + elif measured_property == 'unemploymentRate': + measurement_method = 'BLSSeasonallyUnadjusted' + if measurement_method: + query.add_constraint('?o', 'measurementMethod', measurement_method) + + # Perform the query and merge the results + new_frame = DCFrame(datalog_query=query, labels=labels, type_hint=type_hint, rows=rows) + self.merge(new_frame) + + # After the merge is performed, check if cleaning needs to be done + if clean_data: + type_func = utils.convert_type(new_col_name, 
'float') + nan_func = utils.drop_nan(new_col_name) + clean_func = utils.compose_process(type_func, nan_func) + self._dataframe = clean_func(self._dataframe) diff --git a/datacommons/utils.py b/datacommons/utils.py new file mode 100644 index 00000000..fc2cc90b --- /dev/null +++ b/datacommons/utils.py @@ -0,0 +1,170 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" DataCommons utilities library + +Contains various functions that can aid in the extension of the DataCommons API. +""" + +from collections import OrderedDict + +import pandas as pd + + +class MeasuredValue: + """ An enumeration of valid measured values in the DataCommons graph. + + A measured value in a Statistical Observation is the type of statistic being + measured for example: "mean", "min", "max", etc. The associated string field + is the DataCommons property used to query for the measured value. + """ + MIN = 'minValue' + MAX = 'maxValue' + MEAN = 'meanValue' + + +class DatalogQuery(object): + """ A class wrapping a DataCommons datalog query string. """ + + def __init__(self): + self._variables = [] + self._constraints = OrderedDict() + + def __str__(self): + """ Returns the query stored by this object. 
""" + query_body = '' + for sub in self._constraints: + for pred in self._constraints[sub]: + for obj in self._constraints[sub][pred]: + if isinstance(obj, list): + obj_vals = ' '.join(obj).strip() + query_body += '{} {} {},\n'.format(pred, sub, obj_vals) + else: + query_body += '{} {} {},\n'.format(pred, sub, obj) + query = 'SELECT {},\n{}'.format(' '.join(self._variables), query_body) + return query + + def variables(self): + """ Returns the set of variables. """ + return self._variables + + def var_types(self): + """ Returns a map from query variable to types specified in the query. """ + # Get initial var types + var_types = {} + for sub in self._constraints: + for pred in self._constraints[sub]: + if pred == 'typeOf': + var_types[sub] = self._constraints[sub][pred][0] + + # If the property is dcid, then the type can be carried over + for sub in self._constraints: + for pred in self._constraints[sub]: + if pred == 'dcid': + for obj in self._constraints[sub][pred]: + if isinstance(obj, str) and sub in var_types: + var_types[obj] = var_types[sub] + return var_types + + def add_variable(self, *variables): + """ Add variables to the query. """ + for var in variables: + if var not in self._variables: + # Maintaining order of the variables is important + self._variables.append(var) + + def add_constraint(self, sub, pred, obj): + """ Add constraints to the query. """ + # Entries in the set correspond to separate lines + if sub not in self._constraints: + self._constraints[sub] = OrderedDict() + if pred not in self._constraints[sub]: + self._constraints[sub][pred] = [] + self._constraints[sub][pred].append(obj) + + +# ------------------------ SELECT AND PROCESS HELPERS ------------------------- + + +def convert_type(col_names, dtype): + """ Converts values in a given column to the given type. + + Args: + col_names: The column or columns to convert + dtype: Data type or a dictionary from column name to data type. 
+ + Returns: A process function that converts the column to a given type. + """ + if isinstance(col_names, str): + col_names = [col_names] + def process(pd_frame): + for name in col_names: + pd_frame[name] = pd.to_numeric(pd_frame[name]) + return pd_frame + return process + +def drop_nan(col_names): + """ Drops rows containing NAN as a value in columns in col_names. + + Args: + col_names: single column name or a list of column names. + """ + if isinstance(col_names, str): + col_names = [col_names] + def process(pd_frame): + return pd_frame.dropna(subset=col_names) + return process + +def delete_column(*cols): + """ Returns a function that deletes the given column from a frame. + + Args: + cols: Columns to delete from the data frame. + + Returns: + A function that deletes columns in the given Pandas DataFrame. + """ + def process(pd_frame): + for col in cols: + if col in pd_frame: + pd_frame = pd_frame.drop(col, axis=1) + return pd_frame + return process + +def compose_select(*select_funcs): + """ Returns a filter function composed of the given selectors. + + Args: + select_funcs: Functions to compose. + + Returns: + A filter function which returns True iff all select_funcs return True. + """ + def select(row): + return all(select_func(row) for select_func in select_funcs) + return select + +def compose_process(*process_funcs): + """ Returns a process function composed of the given functions. + + Args: + process_funcs: Functions to compose. + + Returns: + A process function which performs each function in the order given. + """ + def process(pd_frame): + for process_func in process_funcs: + pd_frame = process_func(pd_frame) + return pd_frame + return process diff --git a/datacommons/weather.py b/datacommons/weather.py new file mode 100644 index 00000000..9123368c --- /dev/null +++ b/datacommons/weather.py @@ -0,0 +1,255 @@ +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DataCommons Weather API Extension. + +Potential improvements: +- Include option to also query for the unit measured. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from types import MethodType +from .datacommons import DCFrame +from . import utils + +_PLACES_WITH_WEATHER = ['City'] + +def WeatherExtension(frame): + """ The DataCommons weather API extension. """ + frame.get_temperature = MethodType(get_temperature, frame) + frame.get_visibility = MethodType(get_visibility, frame) + frame.get_rainfall = MethodType(get_rainfall, frame) + frame.get_snowfall = MethodType(get_snowfall, frame) + frame.get_barometric_pressure = MethodType(get_barometric_pressure, frame) + frame.get_humidity = MethodType(get_humidity, frame) + return frame + +def get_temperature(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing temperature data in celsius. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". 
+ """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'temperature', + rows=rows, + **kwargs) + + +def get_visibility(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing visibility data in kilometer. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". + """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'visibility', + rows=rows, + **kwargs) + +def get_rainfall(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing rainfall data in millimeter. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". + """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'rainfall', + rows=rows, + **kwargs) + +def get_snowfall(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing snowfall data in millimeter. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". 
+ """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'snowfall', + rows=rows, + **kwargs) + +def get_barometric_pressure(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing barometric pressure data in millibar. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". + """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'barometricPressure', + rows=rows, + **kwargs) + +def get_humidity(self, seed_col_name, new_col_name, measured_value, rows=100, **kwargs): + """ Returns column(s) containing humidity data in percent. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date" specified as + "YYYY-MM-DD". + """ + _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + 'humidity', + rows=rows, + **kwargs) + +def _get_weather(self, + seed_col_name, + new_col_name, + measured_value, + weather_type, + rows=100, + **kwargs): + """ Returns a column containing statistics measuring the given weather type. + + Adds a new column to the dataframe containing weather statistics for the + given weather type for the "date" specified in kwargs. + + POTENTIAL USAGE: + # A list of places and date must be provided as parameters with one as the + # seed column and the other in the keyword arguments. + # - If places are provided in the seed column, then "date" must be specified + # in kwargs. 
+ # - If date are provided in the seed column, then "places" must be specified + # in kwargs. + # + # A new column is created for each parameter provided in kwargs. If places + # are provided as the seed column, and date in the kwargs, then a new column + # is created for each date where each row contains the temperature for the + # given date and the row's place. + + Args: + seed_col_name: The name of the seed column. The seed column can either + contain date or location dcids. + new_col_name: The new column's name. + measured_value: The statistic type measured i.e. min / max / mean temp. + weather_type: Can be one of the following: + - "temperature" + - "visibility" + - "rainfall" + - "snowfall" + - "barometricPressure" + - "humidity" + rows: The maximum number of rows to return. + kwargs: Additional keyword arguments include "date". + - The date must be specified as "YYYY-MM-DD" + """ + if seed_col_name not in self._dataframe: + raise ValueError('{} is not a column in the frame.'.format(seed_col_name)) + if new_col_name in self._dataframe: + raise ValueError('{} is already a column.'.format(new_col_name)) + if self._col_types[seed_col_name] not in _PLACES_WITH_WEATHER: + valid_places = ', '.join([place for place in _PLACES_WITH_WEATHER]) + raise ValueError('{} needs to be type of Place e.g. one of {}'.format(seed_col_name, valid_places)) + if 'date' not in kwargs: + raise ValueError('"date" must be specified as a keyword argument.') + + # Get the query variable label map + seed_col_var = '?' + seed_col_name.replace(' ', '_') + new_col_var = '?' + new_col_name.replace(' ', '_') + labels = {seed_col_var: seed_col_name, new_col_var: new_col_name} + + # Get the query variable types + seed_col_type = self._col_types[seed_col_name] + new_col_type = 'Text' + type_hint = {seed_col_var: seed_col_type, new_col_var: new_col_type} + + # Get the query parameters. 
+ place_dcids, date_strings = None, None + if 'date' in kwargs: + place_dcids = list(self._dataframe[seed_col_name]) + date_strings = ['"{}"'.format(date) for date in [kwargs['date']]] + + # NOTE: If we ever want to allow the seed column to contain dates, uncomment + # this block of code. Additionally, remove the list call surrounding + # kwargs['date'] above. + # elif 'places' in kwargs: + # place_dcids = kwargs['places'] + # date_strings = list(self._dataframe[seed_col_name]) + + # Construct the query + query = utils.DatalogQuery() + query.add_variable(seed_col_var, new_col_var) + + # Add the constraints + query.add_constraint('?o', 'typeOf', 'WeatherObservation') + query.add_constraint('?o', 'measuredProperty', '{}'.format(weather_type)) + query.add_constraint('?o', measured_value, new_col_var) + query.add_constraint(seed_col_var, 'dcid', place_dcids) + query.add_constraint('?o', 'observationDate', ' '.join(date_strings)) + if 'date' in kwargs: + query.add_constraint('?o', 'observedNode', seed_col_var) + + # NOTE: If we ever want to allow the seed column to contain dates, uncomment + # this block of code. 
+ # elif 'places' in kwargs: + # query.add_constraint('?o', 'observationDate', seed_col_var) + + new_frame = DCFrame(datalog_query=query, labels=labels, type_hint=type_hint, rows=rows) + self.merge(new_frame) diff --git a/schema/datacomconfig.json b/schema/datacomconfig.json new file mode 100644 index 00000000..9b6cc3a1 --- /dev/null +++ b/schema/datacomconfig.json @@ -0,0 +1,55 @@ +{ + "@context": { + "@vocab": "http://configfiles.schema.org/" + }, + "@type": "DataFeed", + "name": "schema.dataCommons.org", + "prefix": "schemadc", + "siteurl": "https://schema.datacommons.org", + "vocaburl": "https://schema.datacommons.org/", + "atticurl": "https://attic.schema.datacommons.org/", + "dataFeedVar": [ + {"DATACOMMLOC": "https://raw.githubusercontent.com/google/datacommons/master/schema"}, + {"SCHEMAORGLOC": "https://raw.githubusercontent.com/schemaorg/schemaorg/v3.4-release"} + ], + "dataFeedElement": [ + { + "@type": "DataDownload", + "fileContent": "docs", + "contentLocation": "[[DATACOMMLOC]]/docs", + "contentFile": [ + "favicon.ico", + "prettify.css", + "prettify.js", + "schemaorg.css" + ] + }, + { + "@type": "DataDownload", + "fileContent": "terms", + "contentLocation": "[[DATACOMMLOC]]", + "contentFile": "datacommons.rdfa" + }, + { + "@type": "DataDownload", + "fileContent": "examples", + "contentLocation": "[[SCHEMAORGLOC]]", + "contentFile": "examples.txt" + }, + { + "@type": "DataDownload", + "fileContent": "templates", + "contentLocation": "[[DATACOMMLOC]]/templates" + }, + { + "@type": "DataDownload", + "fileContent": "terms", + "addPrefix": "schema", + "addVocaburl": "http://schema.org/", + "contentFile": [ + "[[SCHEMAORGLOC]]/data/schema.rdfa", + "[[SCHEMAORGLOC]]/data/ext/meta/meta.rdfa" + ] + } + ] +} \ No newline at end of file diff --git a/schema/datacommons.rdfa b/schema/datacommons.rdfa index 75f92249..863c5186 100644 --- a/schema/datacommons.rdfa +++ b/schema/datacommons.rdfa @@ -115,8 +115,8 @@
CriminalActivities - Activities or behaviors considered crimes by the FBI. This type is used primarily for computing StatisticalPopulations used in querying crime statistics. - + Activities or behaviors considered crimes by the FBI. This type is used primarily for computing [[StatisticalPopulation]]s used in querying crime statistics. +
@@ -1367,5 +1367,78 @@ +
+ County + In the United States, a county is a political and geographic subdivision of a state. In some cases, counties are coterminous with localities. + +
+
+ area + The land area of an AdministrativeArea. + + + +
+
+ timezone + The timezone for the location. + + +
+ +
+ dcid + The unique identifier for the entity in dataCommons. + + +
+
+ freebaseId + The unique identifier for the entity in Freebase. + + +
+
+ geonamesId + The unique identifier for the entity in GeoNames. + + +
+
+ wikidataId + The unique identifier for the entity in Wikidata. + + +
+
+ gnisId + The unique identifier for the entity in the Geographic Names Information System (GNIS), the Federal and national + standard for geographic nomenclature, issued by the U.S. Geological Survey.
+ Example: "1779778" +
Reference: GNIS.
+ + +
+
+ fipsId + + Federal Information Processing Series (FIPS) codes, issued by the American National + Standards Institute (ANSI codes). These are standardized numeric or + alphabetic codes that ensure uniform identification of geographic entities + across all federal government agencies.
+ Example: "06" for California.
+ Reference: ANSI. +
+ + +
+
+ provenance + The source for the data. Provenance is labeled per triple. + + +
+ diff --git a/schema/datacomschema.yaml b/schema/datacomschema.yaml new file mode 100644 index 00000000..cf25d80f --- /dev/null +++ b/schema/datacomschema.yaml @@ -0,0 +1,216 @@ +# application: webschemas-g +# usage: gcloud app deploy webschemas.yaml --project webschemas-g + +runtime: python27 +api_version: 1 +threadsafe: true + + +automatic_scaling: #Only applicable for appengine accounts with billing enabled + min_idle_instances: 1 + +instance_class: F2 + +inbound_services: +- warmup + +env_variables: + TARGETSITE: 'datacommons.org' + PRODSITEDEBUG: 'False' + CONFIGFILE: 'sdoconfig.json' + MOREBLOCK: 'False' + WARMUPSTATE: 'Auto' # 'Off', 'On', 'Auto' - Off for localhost, On elsewhere + STAYINEXTENTION: 'False' + PAGESTOREMODE: 'CLOUDSTORE' # 'INMEM' (In instance memory), 'NDBSHARED' (NDB shared - accross instances), 'CLOUDSTORE' (Cloudstorage files) + EXAMPLESTOREMODE: 'INMEM' # 'INMEM', 'NDBSHARED' + TIMESTAMPSTOREMODE: 'CLOUDSTORE' # 'INMEM', 'NDBSHARED', 'CLOUDSTORE' + CACHE_CONTROL: 'public, max-age=600' +# CACHE_CONTROL: 'no-cache' + +handlers: + +- url: /.*favicon.ico + static_files: docs/favicon.ico + upload: docs/favicon.ico + mime_type: image/x-icon + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /robots.txt + static_files: docs/robots.txt + upload: docs/robots.txt + secure: always + redirect_http_response_code: 301 + mime_type: text/plain + +- url: /docs/schemaorg.owl + static_files: docs/schemaorg.owl + upload: docs/schemaorg.owl + mime_type: application/rdf+xml + secure: always + redirect_http_response_code: 301 + +- url: /docs/schema_org_rdfa.html + static_files: data/schema.rdfa + upload: data/schema.rdfa + application_readable: True + mime_type: text/html + secure: always + redirect_http_response_code: 301 + +- url: /docs/jsonldcontext.json.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/full.*.html + script: sdoapp.app + secure: always + 
redirect_http_response_code: 301 + +- url: /docs/schemas.html + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/developers.html + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/tree.json.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs + static_dir: docs + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /.*docs/* + static_dir: docs + application_readable: True + secure: always + redirect_http_response_code: 301 + +#- url: / +# static_files: static/index.html +# upload: static/index.html +# application_readable: True + +- url: /admin/refresh + login: required + script: sdoapp.app + +- url: /admin + static_dir: admin + application_readable: True + +- url: /search_files + static_dir: static/search_files + secure: always + redirect_http_response_code: 301 + +- url: /version/build-latest/.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/latest/.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /(version/[^/]*/)$ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /(version/([^/]*))$ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.rdfa) + mime_type: text/html + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.rdfa) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.ttl) + mime_type: application/x-turtle + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.ttl) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.jsonld) + mime_type: application/ld+json + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.jsonld) + 
application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.rdf) + mime_type: application/rdf+xml + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.rdf) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.nt) + mime_type: application/n-triples + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.nt) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.nq) + mime_type: application/n-quads + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.nq) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.csv) + mime_type: text/csv + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.csv) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/*/* + static_dir: data/releases/ + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + + + + +libraries: +- name: webapp2 + version: 2.5.2 +- name: jinja2 + version: 2.6 diff --git a/schema/deployment.md b/schema/deployment.md new file mode 100644 index 00000000..a78a2c97 --- /dev/null +++ b/schema/deployment.md @@ -0,0 +1,31 @@ +**[Temporary] Schema/Datacommons Deployment instructions** + +These assume that the version of SDOAPP in use is in schemaorg branch ‘**vocabindi2**’ , and the data files for datacommons/schema are in repository ‘**https://github.com/RichardWallis/datacommons/tree/appupdate**’. + +1. Checkout SDOAPP - branch vocabindi2 + git clone https://github.com/schemaorg/schemaorg.git + git checkout vocabindi2 + +2. 
Get Datacommons deployment files + curl -O https://raw.githubusercontent.com/RichardWallis/datacommons/appupdate/schema/datacomschema.yaml + curl -O https://raw.githubusercontent.com/RichardWallis/datacommons/appupdate/schema/test-datacomschema.yaml + +3. Deploy to an app engine: + Note: change project name from sdo-rjwtest to the appropriate one. + Note: change --version=2 to one that is not used in the target project + + If using files in the RichardWallis/datacommons repo, branch appupdate: + scripts/appdeploy.sh --no-promote --project sdo-rjwtest --version=2 test-datacomschema.yaml + + When appupdate branch has been merged in to datacommons: + scripts/appdeploy.sh --no-promote --project sdo-rjwtest --version=2 datacomschema.yaml + +4. Migrate traffic to version 2 (or whichever version label is chosen) + +5. To see which configuration files are in use: + * In console/App Engine/ Versions select Config>View for the version + * Scroll to env_variables + * Copy value of CONFIGFILE + * Paste into browser to view + * DATACOMMLOC value is root location for Datacommons config files + * SCHEMAORGLOC value is root location of Schema.org config files referenced in displays \ No newline at end of file diff --git a/schema/docs/favicon.ico b/schema/docs/favicon.ico new file mode 100644 index 00000000..4cc418dd Binary files /dev/null and b/schema/docs/favicon.ico differ diff --git a/schema/docs/prettify.css b/schema/docs/prettify.css new file mode 100644 index 00000000..72569cfa --- /dev/null +++ b/schema/docs/prettify.css @@ -0,0 +1,52 @@ +/* Pretty printing styles. Used with prettify.js. 
*/ + +.str { color: #080; } +.kwd { color: #008; } +.com { color: #800; } +.typ { color: #606; } +.lit { color: #066; } +.pun { color: #660; } +.pln { color: #000; } +.tag { color: #008; } +.atn { color: #900; } +.atv { color: #080; } +.dec { color: #606; } +pre.prettyprint { + padding: 5px; + border: 1px solid #CCC; + background: #EFEFEF; +} +pre.prettyprint ol li:hover { background: #DFDFDF; } + +/* Specify class=linenums on a pre to get line numbering */ +ol.linenums { + margin: 0; /* IE indents via margin-left */ + padding:0; list-style-type: none; +} +li.L0, +li.L1, +li.L2, +li.L3, +li.L5, +li.L6, +li.L7, +li.L8 { list-style-type: none } +/* Alternate shading for lines */ +li.L1, +li.L3, +li.L5, +li.L7, +li.L9 { /* background: #efefef; using hover instead */ } + +@media print { + .str { color: #060; } + .kwd { color: #006; font-weight: bold; } + .com { color: #600; font-style: italic; } + .typ { color: #404; font-weight: bold; } + .lit { color: #044; } + .pun { color: #440; } + .pln { color: #000; } + .tag { color: #006; font-weight: bold; } + .atn { color: #404; } + .atv { color: #060; } +} \ No newline at end of file diff --git a/schema/docs/prettify.js b/schema/docs/prettify.js new file mode 100644 index 00000000..c9161da9 --- /dev/null +++ b/schema/docs/prettify.js @@ -0,0 +1,33 @@ +window.PR_SHOULD_USE_CONTINUATION=true;window.PR_TAB_WIDTH=8;window.PR_normalizedHtml=window.PR=window.prettyPrintOne=window.prettyPrint=void 0;window._pr_isIE6=function(){var y=navigator&&navigator.userAgent&&navigator.userAgent.match(/\bMSIE ([678])\./);y=y?+y[1]:false;window._pr_isIE6=function(){return y};return y}; +(function(){function y(b){return b.replace(L,"&").replace(M,"<").replace(N,">")}function H(b,f,i){switch(b.nodeType){case 1:var o=b.tagName.toLowerCase();f.push("<",o);var l=b.attributes,n=l.length;if(n){if(i){for(var r=[],j=n;--j>=0;)r[j]=l[j];r.sort(function(q,m){return q.name"); 
+for(l=b.firstChild;l;l=l.nextSibling)H(l,f,i);if(b.firstChild||!/^(?:br|link|img)$/.test(o))f.push("");break;case 3:case 4:f.push(y(b.nodeValue));break}}function O(b){function f(c){if(c.charAt(0)!=="\\")return c.charCodeAt(0);switch(c.charAt(1)){case "b":return 8;case "t":return 9;case "n":return 10;case "v":return 11;case "f":return 12;case "r":return 13;case "u":case "x":return parseInt(c.substring(2),16)||c.charCodeAt(1);case "0":case "1":case "2":case "3":case "4":case "5":case "6":case "7":return parseInt(c.substring(1), +8);default:return c.charCodeAt(1)}}function i(c){if(c<32)return(c<16?"\\x0":"\\x")+c.toString(16);c=String.fromCharCode(c);if(c==="\\"||c==="-"||c==="["||c==="]")c="\\"+c;return c}function o(c){var d=c.substring(1,c.length-1).match(RegExp("\\\\u[0-9A-Fa-f]{4}|\\\\x[0-9A-Fa-f]{2}|\\\\[0-3][0-7]{0,2}|\\\\[0-7]{1,2}|\\\\[\\s\\S]|-|[^-\\\\]","g"));c=[];for(var a=[],k=d[0]==="^",e=k?1:0,h=d.length;e122)){s<65||g>90||a.push([Math.max(65,g)|32,Math.min(s,90)|32]);s<97||g>122||a.push([Math.max(97,g)&-33,Math.min(s,122)&-33])}}a.sort(function(v,w){return v[0]-w[0]||w[1]-v[1]});d=[];g=[NaN,NaN];for(e=0;eh[0]){h[1]+1>h[0]&&a.push("-"); +a.push(i(h[1]))}}a.push("]");return a.join("")}function l(c){for(var d=c.source.match(RegExp("(?:\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]|\\\\u[A-Fa-f0-9]{4}|\\\\x[A-Fa-f0-9]{2}|\\\\[0-9]+|\\\\[^ux0-9]|\\(\\?[:!=]|[\\(\\)\\^]|[^\\x5B\\x5C\\(\\)\\^]+)","g")),a=d.length,k=[],e=0,h=0;e=2&&c==="[")d[e]=o(g);else if(c!=="\\")d[e]=g.replace(/[a-zA-Z]/g,function(s){s=s.charCodeAt(0);return"["+String.fromCharCode(s&-33,s|32)+"]"})}return d.join("")}for(var n=0,r=false,j=false,q=0,m=b.length;q=0;l-=16)o.push(" ".substring(0,l));l=n+1;break;case "\n":f=0;break;default:++f}if(!o)return i;o.push(i.substring(l));return o.join("")}}function I(b, +f,i,o){if(f){b={source:f,c:b};i(b);o.push.apply(o,b.d)}}function B(b,f){var i={},o;(function(){for(var 
r=b.concat(f),j=[],q={},m=0,t=r.length;m=0;)i[c.charAt(d)]=p;p=p[1];c=""+p;if(!q.hasOwnProperty(c)){j.push(p);q[c]=null}}j.push(/[\0-\uffff]/);o=O(j)})();var l=f.length;function n(r){for(var j=r.c,q=[j,z],m=0,t=r.source.match(o)||[],p={},c=0,d=t.length;c=5&&"lang-"===k.substring(0,5))&&!(e&&typeof e[1]==="string")){h=false;k=P}h||(p[a]=k)}g=m;m+=a.length;if(h){h=e[1];var s=a.indexOf(h),v=s+h.length;if(e[2]){v=a.length-e[2].length;s=v-h.length}k=k.substring(5);I(j+g,a.substring(0,s),n,q);I(j+g+s,h,Q(k,h),q);I(j+g+v,a.substring(v),n,q)}else q.push(j+g,k)}r.d=q}return n}function x(b){var f=[],i=[];if(b.tripleQuotedStrings)f.push([A,/^(?:\'\'\'(?:[^\'\\]|\\[\s\S]|\'{1,2}(?=[^\']))*(?:\'\'\'|$)|\"\"\"(?:[^\"\\]|\\[\s\S]|\"{1,2}(?=[^\"]))*(?:\"\"\"|$)|\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$))/, +null,"'\""]);else b.multiLineStrings?f.push([A,/^(?:\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$)|\`(?:[^\\\`]|\\[\s\S])*(?:\`|$))/,null,"'\"`"]):f.push([A,/^(?:\'(?:[^\\\'\r\n]|\\.)*(?:\'|$)|\"(?:[^\\\"\r\n]|\\.)*(?:\"|$))/,null,"\"'"]);b.verbatimStrings&&i.push([A,/^@\"(?:[^\"]|\"\")*(?:\"|$)/,null]);if(b.hashComments)if(b.cStyleComments){f.push([C,/^#(?:(?:define|elif|else|endif|error|ifdef|include|ifndef|line|pragma|undef|warning)\b|[^\r\n]*)/,null,"#"]);i.push([A,/^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h|[a-z]\w*)>/, +null])}else f.push([C,/^#[^\r\n]*/,null,"#"]);if(b.cStyleComments){i.push([C,/^\/\/[^\r\n]*/,null]);i.push([C,/^\/\*[\s\S]*?(?:\*\/|$)/,null])}b.regexLiterals&&i.push(["lang-regex",RegExp("^"+Z+"(/(?=[^/*])(?:[^/\\x5B\\x5C]|\\x5C[\\s\\S]|\\x5B(?:[^\\x5C\\x5D]|\\x5C[\\s\\S])*(?:\\x5D|$))+/)")]);b=b.keywords.replace(/^\s+|\s+$/g,"");b.length&&i.push([R,RegExp("^(?:"+b.replace(/\s+/g,"|")+")\\b"),null]);f.push([z,/^\s+/,null," \r\n\t\u00a0"]);i.push([J,/^@[a-z_$][a-z_$@0-9]*/i,null],[S,/^@?[A-Z]+[a-z][A-Za-z_$@0-9]*/, 
+null],[z,/^[a-z_$][a-z_$@0-9]*/i,null],[J,/^(?:0x[a-f0-9]+|(?:\d(?:_\d+)*\d*(?:\.\d*)?|\.\d\+)(?:e[+\-]?\d+)?)[a-z]*/i,null,"0123456789"],[E,/^.[^\s\w\.$@\'\"\`\/\#]*/,null]);return B(f,i)}function $(b){function f(D){if(D>r){if(j&&j!==q){n.push("");j=null}if(!j&&q){j=q;n.push('')}var T=y(p(i.substring(r,D))).replace(e?d:c,"$1 ");e=k.test(T);n.push(T.replace(a,s));r=D}}var i=b.source,o=b.g,l=b.d,n=[],r=0,j=null,q=null,m=0,t=0,p=Y(window.PR_TAB_WIDTH),c=/([\r\n ]) /g, +d=/(^| ) /gm,a=/\r\n?|\n/g,k=/[ \r\n]$/,e=true,h=window._pr_isIE6();h=h?b.b.tagName==="PRE"?h===6?" \r\n":h===7?" 
\r":" \r":" 
":"
";var g=b.b.className.match(/\blinenums\b(?::(\d+))?/),s;if(g){for(var v=[],w=0;w<10;++w)v[w]=h+'
  • ';var F=g[1]&&g[1].length?g[1]-1:0;n.push('
    1. ");s=function(){var D=v[++F%10];return j?""+D+'':D}}else s=h; +for(;;)if(m");j=null}n.push(o[m+1]);m+=2}else if(t");g&&n.push("
    ");b.a=n.join("")}function u(b,f){for(var i=f.length;--i>=0;){var o=f[i];if(G.hasOwnProperty(o))"console"in window&&console.warn("cannot override language handler %s",o);else G[o]=b}}function Q(b,f){b&&G.hasOwnProperty(b)||(b=/^\s*1&&m.charAt(0)==="<"){if(!ba.test(m))if(ca.test(m)){f.push(m.substring(9,m.length-3));n+=m.length-12}else if(da.test(m)){f.push("\n");++n}else if(m.indexOf(V)>=0&&m.replace(/\s(\w+)\s*=\s*(?:\"([^\"]*)\"|'([^\']*)'|(\S+))/g,' $1="$2$3$4"').match(/[cC][lL][aA][sS][sS]=\"[^\"]*\bnocode\b/)){var t=m.match(W)[2],p=1,c;c=j+1;a:for(;c=0;){var e=p.indexOf(";",k);if(e>=0){var h=p.substring(k+3,e),g=10;if(h&&h.charAt(0)==="x"){h=h.substring(1);g=16}var s=parseInt(h,g);isNaN(s)||(p=p.substring(0,k)+String.fromCharCode(s)+p.substring(e+1))}}a=p.replace(ea,"<").replace(fa,">").replace(ga,"'").replace(ha,'"').replace(ia," ").replace(ja, +"&")}f.push(a);n+=a.length}}o={source:f.join(""),h:r};var v=o.source;b.source=v;b.c=0;b.g=o.h;Q(i,v)(b);$(b)}catch(w){if("console"in window)console.log(w&&w.stack?w.stack:w)}}var A="str",R="kwd",C="com",S="typ",J="lit",E="pun",z="pln",P="src",V="nocode",Z=function(){for(var b=["!","!=","!==","#","%","%=","&","&&","&&=","&=","(","*","*=","+=",",","-=","->","/","/=",":","::",";","<","<<","<<=","<=","=","==","===",">",">=",">>",">>=",">>>",">>>=","?","@","[","^","^=","^^","^^=","{","|","|=","||","||=", +"~","break","case","continue","delete","do","else","finally","instanceof","return","throw","try","typeof"],f="(?:^^|[+-]",i=0;i:&a-z])/g,"\\$1");f+=")\\s*";return f}(),L=/&/g,M=//g,X=/\"/g,ea=/</g,fa=/>/g,ga=/'/g,ha=/"/g,ja=/&/g,ia=/ /g,ka=/[\r\n]/g,K=null,aa=RegExp("[^<]+| - - -{%- endmacro %} - - - - - -{%- macro debugInfo() -%} - -{% if not debugging %}
    {%- endif %} - -
    -
      -
    • SCHEMA_VERSION: {{ SCHEMA_VERSION }}
    • -
    • ENABLE_HOSTED_EXTENSIONS: {{ ENABLE_HOSTED_EXTENSIONS }}
    • -
    • host_ext: {{ host_ext }}
    • -
    • myhost: {{ myhost }}
    • -
    • myport: {{ myport }}
    • -
    • mybasehost: {{ mybasehost }}
    • -
    • debugging: {{ debugging }}
    • -
    • AppEngine Version: {{ appengineVersion }}
    • -
    -
    - -{% if not debugging %}
    {%- endif %} - - - -{%- endmacro %} diff --git a/schema/templates/full.tpl b/schema/templates/full.tpl index 9a1d5366..76c77763 100644 --- a/schema/templates/full.tpl +++ b/schema/templates/full.tpl @@ -52,30 +52,14 @@ $(document).ready(function(){
    -
    Select vocabulary view:
    -
    - - - {% if ext_button != "" %} - - {% endif %} -
    -
    - +
    Vocabulary view:
    +
    + {{ full_thing_tree | safe }} +
    -
    -{{ thing_tree | safe }} -
    -
    -{{ full_thing_tree | safe }} -
    -{% if ext_button != "" %} -
    - {{ ext_thing_tree | safe }} -
    -{% endif %} -
    -{{ datatype_tree | safe }} +
    + {{ datatype_tree | safe }} +
    diff --git a/schema/templates/genericTermPageHeader.tpl b/schema/templates/genericTermPageHeader.tpl index c76aa2ae..fc08664b 100644 --- a/schema/templates/genericTermPageHeader.tpl +++ b/schema/templates/genericTermPageHeader.tpl @@ -63,7 +63,7 @@ {% include 'basicPageHeader.tpl' with context %} -
    +
    {{ ext_mappings | safe }} diff --git a/schema/templates/schemas.tpl b/schema/templates/schemas.tpl index 16315147..70ffcda4 100644 --- a/schema/templates/schemas.tpl +++ b/schema/templates/schemas.tpl @@ -4,8 +4,7 @@ {% include 'headtags.tpl' with context %} Schemas - schema.datacommons.org - + @@ -17,18 +16,16 @@

    Organization of Schemas

    The schemas are a set of 'types', each associated with a set of properties. The types are arranged in a hierarchy.
    -{{ counts | safe }}

    Browse the full hierarchy:
    Or you can jump directly to a commonly used type: diff --git a/schema/templates/siteDebug.tpl b/schema/templates/siteDebug.tpl deleted file mode 100644 index 8262434f..00000000 --- a/schema/templates/siteDebug.tpl +++ /dev/null @@ -1,22 +0,0 @@ - - - - {% include 'headtags.tpl' with context %} - - {{ sitename }} - - - - - - -{% include 'basicPageHeader.tpl' with context %} - -
    -

    Site Debug

    - - - - diff --git a/schema/templates/tocVersionPage.tpl b/schema/templates/tocVersionPage.tpl deleted file mode 100644 index ef28434c..00000000 --- a/schema/templates/tocVersionPage.tpl +++ /dev/null @@ -1,33 +0,0 @@ - - - - {% include 'headtags.tpl' with context %} - Schema.org - Full Releases - - - - - - -{% include 'basicPageHeader.tpl' with context %} - -
    - -

    Schema.org versions

    - -

    See the releases page for a longer and more detailed history of schema.org releases.

    - -

    The following snapshot(s) of schema.org releases are available:

    - -
      -{% for release in releases %} -
    • {{release}}
    • -{% endfor %} -
    - -

    Note that these snapshots currently contain only the schema.org core vocabulary. Information about extensions and older releases may be added later.

    - - -

    -
    diff --git a/schema/templates/wrongExt.tpl b/schema/templates/wrongExt.tpl deleted file mode 100644 index bfc7ac2a..00000000 --- a/schema/templates/wrongExt.tpl +++ /dev/null @@ -1,34 +0,0 @@ - - - - - {% include 'headtags.tpl' with context %} - {{target}} defined in '{{ targetext }}' extention - {{ sitename }} - - - - - -{% include 'basicPageHeader.tpl' with context %} - -
    -

    Schema.org Extensions

    - -

    - The term '{{ target }}' is not in the schema.org core vocabulary, but is defined in an extension: -

    - - -

    Note: extension terms can be used in schema.org markup in the normal manner; it is not necessary for markup publishers to indicate which extension a term is currently in. Terms may move between extensions over time (e.g. from pending to the core) without the need for the corresponding markup to change. -

    -
    - -
    - - - diff --git a/schema/test-datacomconfig.json b/schema/test-datacomconfig.json new file mode 100644 index 00000000..bf36ef14 --- /dev/null +++ b/schema/test-datacomconfig.json @@ -0,0 +1,55 @@ +{ + "@context": { + "@vocab": "http://configfiles.schema.org/" + }, + "@type": "DataFeed", + "name": "schema.dataCommons.org", + "prefix": "schemadc", + "siteurl": "https://schema.datacommons.org", + "vocaburl": "https://schema.datacommons.org/", + "atticurl": "https://attic.schema.datacommons.org/", + "dataFeedVar": [ + {"DATACOMMLOC": "https://raw.githubusercontent.com/RichardWallis/datacommons/appupdate/schema"}, + {"SCHEMAORGLOC": "https://raw.githubusercontent.com/schemaorg/schemaorg/v3.4-release"} + ], + "dataFeedElement": [ + { + "@type": "DataDownload", + "fileContent": "docs", + "contentLocation": "[[DATACOMMLOC]]/docs", + "contentFile": [ + "favicon.ico", + "prettify.css", + "prettify.js", + "schemaorg.css" + ] + }, + { + "@type": "DataDownload", + "fileContent": "terms", + "contentLocation": "[[DATACOMMLOC]]", + "contentFile": "datacommons.rdfa" + }, + { + "@type": "DataDownload", + "fileContent": "examples", + "contentLocation": "[[SCHEMAORGLOC]]", + "contentFile": "examples.txt" + }, + { + "@type": "DataDownload", + "fileContent": "templates", + "contentLocation": "[[DATACOMMLOC]]/templates" + }, + { + "@type": "DataDownload", + "fileContent": "terms", + "addPrefix": "schema", + "addVocaburl": "http://schema.org/", + "contentFile": [ + "[[SCHEMAORGLOC]]/data/schema.rdfa", + "[[SCHEMAORGLOC]]/data/ext/meta/meta.rdfa" + ] + } + ] +} \ No newline at end of file diff --git a/schema/test-datacomschema.yaml b/schema/test-datacomschema.yaml new file mode 100644 index 00000000..226515e5 --- /dev/null +++ b/schema/test-datacomschema.yaml @@ -0,0 +1,215 @@ +# application: webschemas-g +# usage: gcloud app deploy webschemas.yaml --project webschemas-g + +runtime: python27 +api_version: 1 +threadsafe: true + + +#automatic_scaling: +# min_idle_instances: 2 + + 
+inbound_services: +- warmup + +env_variables: + TARGETSITE: 'datacommons.org' + PRODSITEDEBUG: 'False' + CONFIGFILE: 'https://raw.githubusercontent.com/RichardWallis/datacommons/appupdate/schema/test-datacomconfig.json' + MOREBLOCK: 'False' + WARMUPSTATE: 'Auto' # 'Off', 'On', 'Auto' - Off for localhost, On elsewhere + STAYINEXTENTION: 'False' + PAGESTOREMODE: 'CLOUDSTORE' # 'INMEM' (In instance memory), 'NDBSHARED' (NDB shared - accross instances), 'CLOUDSTORE' (Cloudstorage files) + EXAMPLESTOREMODE: 'INMEM' # 'INMEM', 'NDBSHARED' + TIMESTAMPSTOREMODE: 'CLOUDSTORE' # 'INMEM', 'NDBSHARED', 'CLOUDSTORE' +# CACHE_CONTROL: 'public, max-age=600' + CACHE_CONTROL: 'no-cache' + +handlers: + +- url: /.*favicon.ico + static_files: docs/favicon.ico + upload: docs/favicon.ico + mime_type: image/x-icon + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /robots.txt + static_files: docs/robots.txt + upload: docs/robots.txt + secure: always + redirect_http_response_code: 301 + mime_type: text/plain + +- url: /docs/schemaorg.owl + static_files: docs/schemaorg.owl + upload: docs/schemaorg.owl + mime_type: application/rdf+xml + secure: always + redirect_http_response_code: 301 + +- url: /docs/schema_org_rdfa.html + static_files: data/schema.rdfa + upload: data/schema.rdfa + application_readable: True + mime_type: text/html + secure: always + redirect_http_response_code: 301 + +- url: /docs/jsonldcontext.json.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/full.*.html + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/schemas.html + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/developers.html + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs/tree.json.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /docs + static_dir: docs + application_readable: 
True + secure: always + redirect_http_response_code: 301 + +- url: /.*docs/* + static_dir: docs + application_readable: True + secure: always + redirect_http_response_code: 301 + +#- url: / +# static_files: static/index.html +# upload: static/index.html +# application_readable: True + +- url: /admin/refresh + login: required + script: sdoapp.app + +- url: /admin + static_dir: admin + application_readable: True + +- url: /search_files + static_dir: static/search_files + secure: always + redirect_http_response_code: 301 + +- url: /version/build-latest/.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/latest/.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /(version/[^/]*/)$ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /(version/([^/]*))$ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/ + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.rdfa) + mime_type: text/html + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.rdfa) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.ttl) + mime_type: application/x-turtle + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.ttl) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.jsonld) + mime_type: application/ld+json + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.jsonld) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.rdf) + mime_type: application/rdf+xml + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.rdf) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.nt) + mime_type: application/n-triples + static_files: data/releases/\1 + 
upload: data/releases/(.*/.*\.nt) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.nq) + mime_type: application/n-quads + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.nq) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/(.*/.*\.csv) + mime_type: text/csv + static_files: data/releases/\1 + upload: data/releases/(.*/.*\.csv) + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /version/*/* + static_dir: data/releases/ + application_readable: True + secure: always + redirect_http_response_code: 301 + +- url: /.* + script: sdoapp.app + secure: always + redirect_http_response_code: 301 + + + + +libraries: +- name: webapp2 + version: 2.5.2 +- name: jinja2 + version: 2.6