# Source code for skosify.skosify

# encoding=utf8

import sys
import time
import logging
import datetime

from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import Namespace, RDF, RDFS, OWL, DC, DCTERMS, XSD, SKOS
from .rdftools.namespace import SKOSEXT
from .rdftools import (
    read_rdf,
    replace_subject,
    replace_predicate,
    replace_object,
    replace_uri,
    delete_uri,
    localname
)

from .config import Config
from . import infer, check


def mapping_get(uri, mapping):
    """Look up the URI in the given mapping and return the result.

    Keys are tried in order of decreasing specificity:

    1. a key equal to the URI itself,
    2. a key equal to the local name of the URI,
    3. a wildcard key of the form ``*suffix`` whose suffix ends the local
       name; longer wildcard keys are tried before shorter ones.

    :param uri: the URI to look up
    :param mapping: dict-like object whose keys are strings
    :returns: the value stored under the first matching key
    :raises KeyError: if no matching mapping was found

    """
    ln = localname(uri)
    # 1. try to match URI keys, then 2. try to match local names
    for wanted in (uri, ln):
        for key, value in mapping.items():
            if key == wanted:
                return value
    # 3. try to match local names with * prefix
    # try to match longest first, so sort the mapping by key length
    by_key_length = sorted(
        mapping.items(), key=lambda item: len(item[0]), reverse=True)
    for key, value in by_key_length:
        # startswith() rather than key[0], so that an empty key cannot
        # raise IndexError
        if key.startswith('*') and ln.endswith(key[1:]):
            return value
    raise KeyError(uri)


def mapping_match(uri, mapping):
    """Determine whether the given URI matches one of the given mappings.

    :param uri: the URI to look up
    :param mapping: mapping passed on to mapping_get()
    :returns: True if mapping_get() finds a match, False if it raises
        KeyError

    """
    try:
        mapping_get(uri, mapping)
    except KeyError:
        return False
    return True


def in_general_ns(uri):
    """Tell whether the URI belongs to a well-known general RDF namespace.

    The namespaces checked are RDF, RDFS, OWL, SKOS and DC; the URI is
    considered to belong to a namespace if it starts with the namespace URI.

    :param uri: the URI (string-like) to test
    :returns: True if the URI is in one of the namespaces, False otherwise
    """
    general_namespaces = (RDF.uri, RDFS.uri, OWL, SKOS, DC)
    return any(uri.startswith(str(ns)) for ns in general_namespaces)


def get_concept_scheme(rdf):
    """Find the default skos:ConceptScheme of the model.

    As a side effect, every non-literal object of a skos:inScheme triple is
    explicitly typed as skos:ConceptScheme. Literal objects are reported
    with a warning and ignored.

    :param rdf: the graph to inspect (and add type triples to)
    :returns: the first concept scheme in sorted order, or None if the
        model contains no skos:ConceptScheme
    """
    # add explicit type to everything used as a concept scheme
    for _, scheme in sorted(rdf.subject_objects(SKOS.inScheme)):
        if isinstance(scheme, Literal):
            logging.warning(
                "Literal value %s for skos:inScheme detected, ignoring.",
                scheme)
            continue
        rdf.add((scheme, RDF.type, SKOS.ConceptScheme))

    schemes = sorted(rdf.subjects(RDF.type, SKOS.ConceptScheme))
    default_scheme = schemes[0] if schemes else None
    if len(schemes) > 1:
        logging.warning(
            "Multiple concept schemes found. "
            "Selecting %s as default concept scheme.", default_scheme)
    return default_scheme


def detect_namespace(rdf):
    """Try to automatically detect the URI namespace of the vocabulary.

    An arbitrary skos:Concept is picked from the graph and its local name
    is cut off from the end of its URI; what remains is the namespace.

    If no concept is found, or the resulting namespace is blank, a critical
    message is logged and the process exits with status 1.

    :param rdf: the graph to inspect
    :returns: the detected namespace as URIRef

    """

    ns = None

    # pick a concept
    conc = rdf.value(None, RDF.type, SKOS.Concept, any=True)
    if conc is not None:
        ln = localname(conc)
        if ln and conc.endswith(ln):
            # Only strip the trailing local name. str.replace() would also
            # remove earlier occurrences of the same text inside the URI
            # and so corrupt the namespace.
            base = conc[:-len(ln)]
        else:
            base = conc.replace(ln, '')
        ns = URIRef(base)

    if ns is None or ns.strip() == '':
        logging.critical(
            "Namespace auto-detection failed. "
            "Set namespace using the --namespace option.")
        sys.exit(1)

    logging.info(
        "Namespace auto-detected to '%s' "
        "- you can override this with the --namespace option.", ns)
    return ns


def create_concept_scheme(rdf, ns, lname=''):
    """Create a skos:ConceptScheme in the model and return it.

    If no namespace is given, the first owl:Ontology (in sorted order) is
    used to determine it; if there is no ontology either, the namespace is
    auto-detected with detect_namespace(). When an ontology was used, its
    rdf:type, owl:imports and Protege-specific properties are removed and
    all remaining references to it are rewritten to the new concept scheme.

    :param rdf: the graph to modify
    :param ns: namespace of the concept scheme URI; may be empty/None
    :param lname: local name appended to the namespace
    :returns: the URI of the concept scheme
    """

    ont = None
    if not ns:
        # see if there's an owl:Ontology and use that to determine namespace
        onts = sorted(rdf.subjects(RDF.type, OWL.Ontology))
        ont = next(iter(onts), None)

        if len(onts) > 1:
            logging.warning(
                "Multiple owl:Ontology instances found. "
                "Creating concept scheme from %s.", ont)

        if not ont:
            logging.info(
                "No skos:ConceptScheme or owl:Ontology found. "
                "Using namespace auto-detection for creating concept scheme.")
            ns = detect_namespace(rdf)
        elif ont[-1:] in ('/', '#', ':'):
            ns = ont
        else:
            ns = ont + '/'

    cs = Namespace(ns)[lname]

    rdf.add((cs, RDF.type, SKOS.ConceptScheme))

    if ont is not None:
        rdf.remove((ont, RDF.type, OWL.Ontology))
        # remove owl:imports declarations; a wildcard pattern avoids
        # removing triples while iterating over the graph
        rdf.remove((ont, OWL.imports, None))
        # remove protege specific properties; iterate over a snapshot
        # because the loop body removes triples from the graph
        protege_ns = 'http://protege.stanford.edu/plugins/owl/protege#'
        for p, o in list(rdf.predicate_objects(ont)):
            if p.startswith(protege_ns):
                rdf.remove((ont, p, o))
        # move remaining properties (dc:title etc.) of the owl:Ontology into
        # the skos:ConceptScheme
        replace_uri(rdf, ont, cs)

    return cs


def initialize_concept_scheme(rdf, cs, label, language, set_modified):
    """Initialize a concept scheme.

    Optionally add a label if the concept scheme doesn't have one, and
    optionally add a dct:modified timestamp.

    :param rdf: the graph to modify
    :param cs: the concept scheme resource
    :param label: label to set when the scheme has neither rdfs:label nor
        skos:prefLabel; if empty, only a warning is logged
    :param language: language tag for the added label
    :param set_modified: if true, replace any dct:modified value with the
        current UTC time (second precision, xsd:dateTime)
    """

    # check whether the concept scheme is unlabeled, and label it if possible
    labels = list(rdf.objects(cs, RDFS.label)) + \
        list(rdf.objects(cs, SKOS.prefLabel))
    if not labels:
        if label:
            logging.info(
                "Unlabeled concept scheme detected. Setting label to '%s'",
                label)
            rdf.add((cs, RDFS.label, Literal(label, language)))
        else:
            logging.warning(
                "Concept scheme has no label(s). "
                "Use --label option to set the concept scheme label.")

    if set_modified:
        now = datetime.datetime.utcnow().replace(microsecond=0)
        curdate = now.isoformat() + 'Z'
        rdf.remove((cs, DCTERMS.modified, None))
        rdf.add(
            (cs, DCTERMS.modified, Literal(curdate, datatype=XSD.dateTime)))


def transform_sparql_update(rdf, update_query):
    """Perform a SPARQL Update transformation on the RDF data.

    :param rdf: graph object providing an ``update(query)`` method
    :param update_query: the SPARQL Update query text, or ``@filename`` to
        read the query text from the named file
    """

    logging.debug("performing SPARQL Update transformation")

    # startswith() also copes with an empty query string, where indexing
    # the first character would raise IndexError
    if update_query.startswith('@'):  # actual query should be read from file
        with open(update_query[1:]) as query_file:
            update_query = query_file.read()

    logging.debug("update query: %s", update_query)
    rdf.update(update_query)


def transform_sparql_construct(rdf, construct_query):
    """Perform a SPARQL CONSTRUCT query on the RDF data and return a new graph.

    :param rdf: the graph to query; it is not modified here
    :param construct_query: the SPARQL CONSTRUCT query text, or
        ``@filename`` to read the query text from the named file
    :returns: a new Graph containing the triples produced by the query
    """

    logging.debug("performing SPARQL CONSTRUCT transformation")

    # startswith() also copes with an empty query string, where indexing
    # the first character would raise IndexError
    if construct_query.startswith('@'):  # actual query should be read from file
        with open(construct_query[1:]) as query_file:
            construct_query = query_file.read()

    logging.debug("CONSTRUCT query: %s", construct_query)

    newgraph = Graph()
    for triple in rdf.query(construct_query):
        newgraph.add(triple)

    return newgraph


def transform_concepts(rdf, typemap):
    """Transform Concepts into new types, as defined by the config file.

    Every rdf:type value used in the model is looked up in the type map,
    except for types in general namespaces that are not listed in the map.
    If the first mapped URI is None, all instances of the type and the type
    itself are deleted; otherwise the type is replaced by the mapped URIs.
    Types without a mapping are only reported.

    :param rdf: the graph to modify
    :param typemap: mapping of type keys to lists of tuples whose first
        element is the replacement URI (or None)
    """

    # find out all the types used in the model
    types = set()
    for _, rdftype in rdf.subject_objects(RDF.type):
        if rdftype in typemap or not in_general_ns(rdftype):
            types.add(rdftype)

    for t in sorted(types):
        try:
            newval = mapping_get(t, typemap)
        except KeyError:
            logging.info("Don't know what to do with type %s", t)
            continue

        newuris = [v[0] for v in newval]
        logging.debug("transform class %s -> %s", t, str(newuris))
        if newuris[0] is None:  # delete all instances
            # iterate over a snapshot: delete_uri() modifies the graph
            for inst in list(rdf.subjects(RDF.type, t)):
                delete_uri(rdf, inst)
            delete_uri(rdf, t)
        else:
            replace_object(rdf, t, newuris, predicate=RDF.type)


def transform_literals(rdf, literalmap):
    """Transform literal properties of Concepts, as defined by config file.

    Collects the predicates that carry literal values on concepts,
    collections and deprecated concepts (skipping general-namespace
    predicates that are not in the map) and replaces each mapped predicate
    with its target URIs. Unmapped predicates are only reported.

    :param rdf: the graph to modify
    :param literalmap: mapping of predicate keys to lists of tuples whose
        first element is the replacement URI
    """

    affected_types = (SKOS.Concept, SKOS.Collection,
                      SKOSEXT.DeprecatedConcept)

    literal_props = {
        p
        for t in affected_types
        for conc in rdf.subjects(RDF.type, t)
        for p, o in rdf.predicate_objects(conc)
        if isinstance(o, Literal)
        and (p in literalmap or not in_general_ns(p))
    }

    for prop in sorted(literal_props):
        try:
            targets = mapping_get(prop, literalmap)
        except KeyError:
            logging.info("Don't know what to do with literal %s", prop)
        else:
            newuris = [target[0] for target in targets]
            logging.debug("transform literal %s -> %s", prop, str(newuris))
            replace_predicate(
                rdf, prop, newuris, subjecttypes=affected_types)


def transform_relations(rdf, relationmap):
    """Transform YSO-style concept relations into SKOS equivalents.

    Collects the predicates that point to URI or blank-node objects on
    concepts, collections and deprecated concepts (skipping
    general-namespace predicates that are not in the map) and replaces each
    mapped predicate with the mapped value. Unmapped predicates are only
    reported.

    :param rdf: the graph to modify
    :param relationmap: mapping of predicate keys to replacement values,
        passed as-is to replace_predicate()
    """

    affected_types = (SKOS.Concept, SKOS.Collection,
                      SKOSEXT.DeprecatedConcept)

    relation_props = set()
    for t in affected_types:
        for conc in rdf.subjects(RDF.type, t):
            for p, o in rdf.predicate_objects(conc):
                if not isinstance(o, (URIRef, BNode)):
                    continue
                if p in relationmap or not in_general_ns(p):
                    relation_props.add(p)

    for prop in sorted(relation_props):
        try:
            targets = mapping_get(prop, relationmap)
        except KeyError:
            logging.info("Don't know what to do with relation %s", prop)
        else:
            logging.debug("transform relation %s -> %s", prop, str(targets))
            replace_predicate(
                rdf, prop, targets, subjecttypes=affected_types)


def transform_labels(rdf, defaultlanguage):
    """Normalize labels and notes, and resolve skosext:candidateLabel.

    1. For SKOS label and documentation properties, skosext:candidateLabel
       and rdfs:label: strip leading/trailing whitespace from literal
       values and, if a default language is given, tag literals without a
       language with it.
    2. Turn each skosext:candidateLabel into a skos:prefLabel if the
       concept has no prefLabel in that language yet, otherwise into a
       skos:altLabel.

    :param rdf: the graph to modify
    :param defaultlanguage: language tag for untagged literals; if empty,
        untagged literals are left alone
    """
    # fix labels and documentary notes with extra whitespace
    for labelProp in (
            SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel,
            SKOSEXT.candidateLabel, SKOS.note, SKOS.scopeNote,
            SKOS.definition, SKOS.example, SKOS.historyNote,
            SKOS.editorialNote, SKOS.changeNote, RDFS.label):
        for conc, label in sorted(rdf.subject_objects(labelProp)):
            if not isinstance(label, Literal):
                continue
            # strip extra whitespace, if found
            stripped = label.strip()
            if len(stripped) < len(label):
                logging.warning(
                    "Stripping whitespace from label of %s: '%s'", conc, label)
                newlabel = Literal(stripped, label.language)
                rdf.remove((conc, labelProp, label))
                rdf.add((conc, labelProp, newlabel))
                label = newlabel
            # set default language
            if defaultlanguage and label.language is None:
                logging.warning(
                    "Setting default language of '%s' to %s",
                    label, defaultlanguage)
                newlabel = Literal(label, defaultlanguage)
                rdf.remove((conc, labelProp, label))
                rdf.add((conc, labelProp, newlabel))

    # make skosext:candidateLabel either prefLabel or altLabel
    # make a set of (concept, language) tuples for concepts which have
    # candidateLabels in some language
    conc_lang = {(c, l.language)
                 for c, l in rdf.subject_objects(SKOSEXT.candidateLabel)}
    for conc, lang in conc_lang:
        # check whether there are already prefLabels for this concept in this
        # language
        pref_languages = [pl.language
                          for pl in rdf.objects(conc, SKOS.prefLabel)]
        if lang not in pref_languages:
            # no -> let's transform the candidate labels into prefLabels
            to_prop = SKOS.prefLabel
        else:
            # yes -> let's make them altLabels instead
            to_prop = SKOS.altLabel

        # do the actual transform from candidateLabel to prefLabel or
        # altLabel; iterate over a snapshot because the loop body adds and
        # removes triples
        for label in list(rdf.objects(conc, SKOSEXT.candidateLabel)):
            if label.language != lang:
                continue
            rdf.remove((conc, SKOSEXT.candidateLabel, label))
            rdf.add((conc, to_prop, label))

    # Safety net: the loop above handles every (concept, language) pair, so
    # normally no candidateLabel triples remain at this point. Iterate over
    # a snapshot because the loop body modifies the graph.
    for conc, label in list(rdf.subject_objects(SKOSEXT.candidateLabel)):
        rdf.remove((conc, SKOSEXT.candidateLabel, label))
        if label.language not in [pl.language
                                  for pl in rdf.objects(conc, SKOS.prefLabel)]:
            # no prefLabel found, make this candidateLabel a prefLabel
            rdf.add((conc, SKOS.prefLabel, label))
        else:
            # prefLabel found, make it an altLabel instead
            rdf.add((conc, SKOS.altLabel, label))


def transform_collections(rdf):
    """Take skos:Collections out of the concept hierarchy.

    For every collection, broader / broaderGeneric links to and from the
    collection are removed. Resources that had the collection as their
    broader become skos:member of the collection and are linked directly to
    the collection's former broaders. Finally all SKOS semantic and mapping
    relations to or from the collection are removed with a warning.

    :param rdf: the graph to modify
    """
    hierarchy_props = (SKOS.broader, SKOSEXT.broaderGeneric)

    # avoid using SKOS semantic relations as they're only meant
    # to be used for concepts (i.e. have rdfs:domain skos:Concept)
    # FIXME should maybe use some substitute for exactMatch for
    # collections?
    concept_only_props = (
        SKOS.semanticRelation,
        SKOS.broader, SKOS.narrower, SKOS.related,
        SKOS.broaderTransitive, SKOS.narrowerTransitive,
        SKOS.mappingRelation,
        SKOS.closeMatch, SKOS.exactMatch,
        SKOS.broadMatch, SKOS.narrowMatch, SKOS.relatedMatch,
        SKOS.topConceptOf, SKOS.hasTopConcept)

    for coll in sorted(rdf.subjects(RDF.type, SKOS.Collection)):
        for prop in hierarchy_props:
            parents = set(rdf.objects(coll, prop))
            children = set(rdf.subjects(prop, coll))
            # remove the Collection from the hierarchy
            for parent in parents:
                rdf.remove((coll, prop, parent))
            for child in children:
                # replace the broader relationship with inverse skos:member
                rdf.remove((child, prop, coll))
                rdf.add((coll, SKOS.member, child))
                # link the member directly to the broaders of the collection
                for parent in parents:
                    rdf.add((child, prop, parent))

        for relProp in concept_only_props:
            for target in sorted(rdf.objects(coll, relProp)):
                logging.warning(
                    "Removing concept relation %s -> %s from collection %s",
                    localname(relProp), target, coll)
                rdf.remove((coll, relProp, target))
            for source in sorted(rdf.subjects(relProp, coll)):
                logging.warning(
                    "Removing concept relation %s <- %s from collection %s",
                    localname(relProp), source, coll)
                rdf.remove((source, relProp, coll))


def transform_aggregate_concepts(rdf, cs, relationmap, aggregates):
    """Transform YSO-style AggregateConcepts into skos:Concepts within their
    own skos:ConceptScheme, linked to the regular concepts with
    SKOS.narrowMatch relationships. If aggregates is False, remove
    all aggregate concepts instead.

    An aggregate concept is recognized by an equivalentClass-style relation
    (looked up in relationmap, defaulting to owl:equivalentClass) pointing
    to a resource that has an owl:unionOf list.

    :param rdf: the graph to modify
    :param cs: the default concept scheme; its namespace is reused for the
        aggregate concept scheme
    :param relationmap: relation mapping from the configuration
    :param aggregates: whether to keep (True) or delete (False) aggregate
        concepts
    """

    if not aggregates:
        logging.debug("removing aggregate concepts")

    aggregate_concepts = []

    relation = relationmap.get(
        OWL.equivalentClass, [(OWL.equivalentClass, False)])[0][0]
    # iterate over a snapshot: the loop body adds and removes triples
    for conc, eq in list(rdf.subject_objects(relation)):
        eql = rdf.value(eq, OWL.unionOf, None)
        if eql is None:
            continue
        if aggregates:
            aggregate_concepts.append(conc)
            for item in list(rdf.items(eql)):
                rdf.add((conc, SKOS.narrowMatch, item))
        # remove the old equivalentClass-unionOf-rdf:List structure
        rdf.remove((conc, relation, eq))
        rdf.remove((eq, RDF.type, OWL.Class))
        rdf.remove((eq, OWL.unionOf, eql))
        # remove the rdf:List structure
        delete_uri(rdf, eql)
        if not aggregates:
            delete_uri(rdf, conc)

    if aggregate_concepts:
        lname = localname(cs)
        if lname and cs.endswith(lname):
            # Only cut the trailing local name; str.replace() would also
            # remove earlier occurrences of the same text in the URI.
            ns = cs[:-len(lname)]
        else:
            ns = cs.replace(lname, '')
        acs = create_concept_scheme(rdf, ns, 'aggregateconceptscheme')
        logging.debug("creating aggregate concept scheme %s", acs)
        for conc in aggregate_concepts:
            rdf.add((conc, SKOS.inScheme, acs))


def transform_deprecated_concepts(rdf, cs):
    """Transform deprecated concepts so they are in their own concept
    scheme.

    Every skosext:DeprecatedConcept is additionally typed as skos:Concept,
    flagged with owl:deprecated true and placed in a separate
    'deprecatedconceptscheme' created in the namespace of the given scheme.

    :param rdf: the graph to modify
    :param cs: the default concept scheme; its namespace is reused for the
        deprecated concept scheme
    """

    # Take a snapshot first: the loop below adds rdf:type triples, which
    # must not happen while still iterating over rdf:type triples.
    deprecated_concepts = list(
        rdf.subjects(RDF.type, SKOSEXT.DeprecatedConcept))

    for conc in deprecated_concepts:
        rdf.add((conc, RDF.type, SKOS.Concept))
        rdf.add((conc, OWL.deprecated, Literal("true", datatype=XSD.boolean)))

    if deprecated_concepts:
        lname = localname(cs)
        if lname and cs.endswith(lname):
            # Only cut the trailing local name; str.replace() would also
            # remove earlier occurrences of the same text in the URI.
            ns = cs[:-len(lname)]
        else:
            ns = cs.replace(lname, '')
        dcs = create_concept_scheme(
            rdf, ns, 'deprecatedconceptscheme')
        logging.debug("creating deprecated concept scheme %s", dcs)
        for conc in deprecated_concepts:
            rdf.add((conc, SKOS.inScheme, dcs))


# { ?a skos:broader ?b . ?b skos:broader ?c }
# => { ?a skos:broaderTransitive ?b, ?c . ?b skos:broaderTransitive ?c }
def infer_broaderTransitive(rdf):
    """Add skos:broaderTransitive from each concept to all of its
    (transitive) skos:broader ancestors.

    :param rdf: the graph to modify
    """
    # snapshots are taken because triples are added while looping
    for conc in list(rdf.subjects(RDF.type, SKOS.Concept)):
        for bt in list(rdf.transitive_objects(conc, SKOS.broader)):
            if bt == conc:
                continue
            rdf.add((conc, SKOS.broaderTransitive, bt))


# { ?a skos:broader ?b . ?b skos:broader ?c }
# => { ?c skos:narrowerTransitive ?a, ?b . ?b skos:narrowerTransitive ?a }
def infer_narrowerTransitive(rdf):
    """Add skos:narrowerTransitive from all (transitive) skos:broader
    ancestors of each concept to that concept.

    :param rdf: the graph to modify
    """
    # snapshots are taken because triples are added while looping
    for conc in list(rdf.subjects(RDF.type, SKOS.Concept)):
        for bt in list(rdf.transitive_objects(conc, SKOS.broader)):
            if bt == conc:
                continue
            rdf.add((bt, SKOS.narrowerTransitive, conc))


# { ?a skos:broader ?b } <=> { ?b skos:narrower ?a }
def infer_broader_narrower(rdf):
    """Make skos:broader and skos:narrower mutually inverse.

    For every broader triple the inverse narrower triple is added, and then
    for every narrower triple the inverse broader triple is added.

    :param rdf: the graph to modify
    """
    # snapshots are taken because triples are added while looping
    for s, o in list(rdf.subject_objects(SKOS.broader)):
        rdf.add((o, SKOS.narrower, s))
    for s, o in list(rdf.subject_objects(SKOS.narrower)):
        rdf.add((o, SKOS.broader, s))


def enrich_relations(rdf, enrich_mappings, use_narrower, use_transitive):
    """Enrich the SKOS relations according to SKOS semantics, including
    subproperties of broader and symmetric related properties.

    If use_narrower is True, include inverse narrower relations for all
    broader relations. If use_narrower is False, instead remove all narrower
    relations, replacing them with inverse broader relations. If
    use_transitive is True, calculate transitive hierarchical relationships
    (broaderTransitive, and also narrowerTransitive if use_narrower is
    True) and include them in the model; otherwise remove any transitive
    relationships from the model.

    :param rdf: the graph to modify
    :param enrich_mappings: if true, also enrich the mapping relations
    :param use_narrower: passed to the infer.skos_* helpers, see above
    :param use_transitive: see above

    """

    # 1. first enrich mapping relationships (because they affect regular ones)

    if enrich_mappings:
        infer.skos_symmetric_mappings(rdf)
        infer.skos_hierarchical_mappings(rdf, use_narrower)

    # 2. then enrich regular relationships

    # related <-> related
    infer.skos_related(rdf)

    # broaderGeneric -> broader, broaderPartitive -> broader
    # (iterate over snapshots because triples are added while looping)
    for subprop in (SKOSEXT.broaderGeneric, SKOSEXT.broaderPartitive):
        for s, o in list(rdf.subject_objects(subprop)):
            rdf.add((s, SKOS.broader, o))

    infer.skos_hierarchical(rdf, use_narrower)

    # transitive closure: broaderTransitive and narrowerTransitive
    if use_transitive:
        infer.skos_transitive(rdf, use_narrower)
    else:
        # transitive relationships are not wanted, so remove them; wildcard
        # patterns avoid removing triples while iterating over the graph
        rdf.remove((None, SKOS.broaderTransitive, None))
        rdf.remove((None, SKOS.narrowerTransitive, None))

    infer.skos_topConcept(rdf)


def setup_top_concepts(rdf, mark_top_concepts):
    """Find the loose top-level concepts of each concept scheme.

    A concept in a scheme that has no skos:broader and is not yet connected
    to the scheme by hasTopConcept/topConceptOf is either marked as a top
    concept (both properties are added) or, when marking is disabled, only
    mentioned in the debug log.

    :param rdf: the graph to inspect and modify
    :param mark_top_concepts: whether to add the top concept triples
    """

    for scheme in sorted(rdf.subjects(RDF.type, SKOS.ConceptScheme)):
        for member in sorted(rdf.subjects(SKOS.inScheme, scheme)):
            if (member, RDF.type, SKOS.Concept) not in rdf:
                continue  # not a Concept, so can't be a top concept

            if rdf.value(member, SKOS.broader, None, any=True) is not None:
                continue  # has a broader concept, so not a top concept

            if (scheme, SKOS.hasTopConcept, member) in rdf or \
               (member, SKOS.topConceptOf, scheme) in rdf:
                continue  # already marked as top concept

            if not mark_top_concepts:
                logging.debug(
                    "Not marking loose concept %s as top concept "
                    "of scheme %s, as mark_top_concepts is disabled",
                    member, scheme)
                continue

            logging.info(
                "Marking loose concept %s "
                "as top concept of scheme %s", member, scheme)
            rdf.add((scheme, SKOS.hasTopConcept, member))
            rdf.add((member, SKOS.topConceptOf, scheme))


def setup_concept_scheme(rdf, defaultcs):
    """Make sure all concepts have an inScheme property, using the given
    default concept scheme if necessary.

    :param rdf: the graph to modify
    :param defaultcs: concept scheme assigned to concepts that have no
        skos:inScheme value
    """
    # iterate over a snapshot because triples are added while looping
    for conc in list(rdf.subjects(RDF.type, SKOS.Concept)):
        if rdf.value(conc, SKOS.inScheme, None, any=True) is None:
            rdf.add((conc, SKOS.inScheme, defaultcs))


def cleanup_classes(rdf):
    """Drop class definitions that are not needed in the output.

    Definitions of classes in the SKOS namespace are always removed. Other
    owl:Class / rdfs:Class definitions are kept if the class has instances
    or is referenced by rdfs:domain, rdfs:range or owl:equivalentClass.
    An unused class that is also a skos:Concept or skos:Collection only
    loses its class typing; any other unused class is removed entirely.

    :param rdf: the graph to modify
    """
    referencing_props = (RDFS.domain, RDFS.range, OWL.equivalentClass)

    for class_type in (OWL.Class, RDFS.Class):
        for cl in sorted(rdf.subjects(RDF.type, class_type)):
            # SKOS classes may be safely removed
            if cl.startswith(str(SKOS)):
                logging.debug("removing SKOS class definition: %s", cl)
                replace_subject(rdf, cl, None)
                continue

            # a class that has instances must stay
            has_instances = \
                rdf.value(None, RDF.type, cl, any=True) is not None
            if has_instances:
                continue

            # so must a class used in a domain/range/equivalentClass
            # definition
            if any(rdf.value(None, prop, cl, any=True) is not None
                   for prop in referencing_props):
                continue

            doubles_as_skos = (cl, RDF.type, SKOS.Concept) in rdf \
                or (cl, RDF.type, SKOS.Collection) in rdf
            if doubles_as_skos:
                # keep the Concept/Collection, drop only the class typing
                logging.debug("removing classiness of %s", cl)
                rdf.remove((cl, RDF.type, class_type))
            else:
                logging.debug("removing unused class definition: %s", cl)
                replace_subject(rdf, cl, None)


def cleanup_properties(rdf):
    """Remove unnecessary property definitions.

    Removes SKOS and DC property definitions and definitions of unused
    properties. A property counts as used if at least one triple has it as
    its predicate.

    :param rdf: the graph to modify
    """
    for t in (RDF.Property, OWL.DatatypeProperty, OWL.ObjectProperty,
              OWL.SymmetricProperty, OWL.TransitiveProperty,
              OWL.InverseFunctionalProperty, OWL.FunctionalProperty):
        for prop in sorted(rdf.subjects(RDF.type, t)):
            if prop.startswith(str(SKOS)):
                logging.debug(
                    "removing SKOS property definition: %s", prop)
                replace_subject(rdf, prop, None)
                continue
            if prop.startswith(str(DC)):
                logging.debug("removing DC property definition: %s", prop)
                replace_subject(rdf, prop, None)
                continue

            # if there are triples using the property, keep the property def;
            # stop at the first such triple instead of collecting them all
            if any(True for _ in rdf.subject_objects(prop)):
                continue

            logging.debug("removing unused property definition: %s", prop)
            replace_subject(rdf, prop, None)


def find_reachable(rdf, res):
    """Return the set of resources reachable from the given resource.

    Two resources are considered connected when they occur in the same
    triple, regardless of position (subject, predicate or object). Only
    URIRef nodes are followed; the start resource itself is always part of
    the result.

    :param rdf: the graph to traverse (not modified)
    :param res: the resource to start from
    :returns: set of visited resources

    """

    starttime = time.time()

    # This is almost a non-recursive breadth-first search algorithm, but a set
    # is used as the "open" set instead of a FIFO, and an arbitrary element of
    # the set is searched. This is slightly faster than DFS (using a stack)
    # and much faster than BFS (using a FIFO).
    visited = set()   # the "closed" set
    pending = {res}   # the "open" set

    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)

        # the other two nodes of every triple in which the current resource
        # appears as subject, as predicate, or as object
        neighbour_pairs = (
            rdf.predicate_objects(current),
            rdf.subject_objects(current),
            rdf.subject_predicates(current),
        )
        for pairs in neighbour_pairs:
            for pair in pairs:
                for node in pair:
                    if isinstance(node, URIRef) and node not in visited:
                        pending.add(node)

    elapsed = time.time() - starttime
    logging.debug("find_reachable took %f seconds", elapsed)

    return visited


def cleanup_unreachable(rdf):
    """Delete every subject resource that find_reachable() does not reach
    when starting the graph traversal from skos:Concept.

    :param rdf: the graph to modify
    """

    subjects = set(rdf.subjects())
    logging.debug("total subject resources: %d", len(subjects))

    orphans = subjects.difference(find_reachable(rdf, SKOS.Concept))
    logging.debug("deleting %s non-reachable resources", len(orphans))

    for orphan in orphans:
        delete_uri(rdf, orphan)


def check_labels(rdf, preflabel_policy):
    """Check that resources have only one prefLabel per language (S14)
    and check overlap between disjoint label properties (S13).

    Delegates to check.preflabel_uniqueness() and check.label_overlap().

    :param rdf: the graph to check
    :param preflabel_policy: passed through unchanged to
        check.preflabel_uniqueness()
    """
    check.preflabel_uniqueness(rdf, preflabel_policy)
    # NOTE(review): the True flag presumably asks label_overlap() to fix
    # the overlaps it finds -- confirm against the check module
    check.label_overlap(rdf, True)


def check_hierarchy(rdf, break_cycles, keep_related, mark_top_concepts,
                    eliminate_redundancy):
    """Check for, and optionally fix, problems in the skos:broader hierarchy.

    Runs the cycle, disjoint-relation and redundancy checks of the check
    module, in that order. If the cycle check returns a true value, top
    concepts are set up again with setup_top_concepts().

    :param Graph rdf: An rdflib.graph.Graph object.
    :param bool break_cycles: Passed to check.hierarchy_cycles().
    :param bool keep_related: Its negation is passed to
        check.disjoint_relations().
    :param bool mark_top_concepts: Passed to setup_top_concepts() when top
        concepts are re-checked.
    :param bool eliminate_redundancy: Passed to
        check.hierarchical_redundancy().
    """
    started = time.time()

    needs_top_concept_recheck = check.hierarchy_cycles(rdf, break_cycles)
    if needs_top_concept_recheck:
        logging.info(
            "Some concepts not reached in initial cycle detection. "
            "Re-checking for loose concepts.")
        setup_top_concepts(rdf, mark_top_concepts)

    fix_disjoint = not keep_related
    check.disjoint_relations(rdf, fix_disjoint)
    check.hierarchical_redundancy(rdf, eliminate_redundancy)

    elapsed = time.time() - started
    logging.debug("check_hierarchy took %f seconds", elapsed)


def skosify(*sources, **config):
    """Convert, extend, and check SKOS vocabulary.

    :param sources: input sources, passed as a tuple to read_rdf()
    :param config: configuration overrides; each keyword that names an
        existing attribute of Config replaces that attribute's default,
        other keywords are ignored
    :returns: the processed graph

    If parsing the input fails, a critical message is logged and the
    process exits with status 1.
    """

    cfg = Config()
    for key in config:
        if hasattr(cfg, key):
            setattr(cfg, key, config[key])
    config = cfg

    namespaces = config.namespaces
    typemap = config.types
    literalmap = config.literals
    relationmap = config.relations

    logging.debug("Skosify starting. $Revision$")
    starttime = time.time()

    logging.debug("Phase 1: Parsing input files")
    try:
        voc = read_rdf(sources, config.from_format)
    except Exception as exc:
        # "except Exception" rather than a bare "except", so that
        # KeyboardInterrupt and SystemExit are not reported as parse errors
        logging.critical("Parsing failed. Exception: %s", str(exc))
        sys.exit(1)

    inputtime = time.time()

    logging.debug("Phase 2: Performing inferences")
    if config.update_query is not None:
        transform_sparql_update(voc, config.update_query)
    if config.construct_query is not None:
        voc = transform_sparql_construct(voc, config.construct_query)
    if config.infer:
        logging.debug("doing RDFS subclass and properties inference")
        infer.rdfs_classes(voc)
        infer.rdfs_properties(voc)

    logging.debug("Phase 3: Setting up namespaces")
    for prefix, uri in namespaces.items():
        voc.namespace_manager.bind(prefix, uri)

    logging.debug("Phase 4: Transforming concepts, literals and relations")
    # transform concepts, literals and concept relations
    transform_concepts(voc, typemap)
    transform_literals(voc, literalmap)
    transform_relations(voc, relationmap)

    # special transforms for labels: whitespace, prefLabel vs altLabel
    transform_labels(voc, config.default_language)

    # special transforms for collections + aggregate and deprecated concepts
    transform_collections(voc)

    # find/create concept scheme
    cs = get_concept_scheme(voc)
    if not cs:
        cs = create_concept_scheme(voc, config.namespace)
    initialize_concept_scheme(voc, cs, label=config.label,
                              language=config.default_language,
                              set_modified=config.set_modified)

    transform_aggregate_concepts(
        voc, cs, relationmap, config.aggregates)
    transform_deprecated_concepts(voc, cs)

    logging.debug("Phase 5: Performing SKOS enrichments")
    # enrichments: broader <-> narrower, related <-> related
    enrich_relations(voc, config.enrich_mappings,
                     config.narrower, config.transitive)

    logging.debug("Phase 6: Cleaning up")
    # clean up unused/unnecessary class/property definitions and unreachable
    # triples
    if config.cleanup_properties:
        cleanup_properties(voc)
    if config.cleanup_classes:
        cleanup_classes(voc)
    if config.cleanup_unreachable:
        cleanup_unreachable(voc)

    logging.debug("Phase 7: Setting up concept schemes and top concepts")
    # setup inScheme and hasTopConcept
    setup_concept_scheme(voc, cs)
    setup_top_concepts(voc, config.mark_top_concepts)

    logging.debug("Phase 8: Checking concept hierarchy")
    # check hierarchy for cycles
    check_hierarchy(voc, config.break_cycles,
                    config.keep_related, config.mark_top_concepts,
                    config.eliminate_redundancy)

    logging.debug("Phase 9: Checking labels")
    # check for duplicate labels
    check_labels(voc, config.preflabel_policy)

    processtime = time.time()

    logging.debug("reading input file took %d seconds",
                  (inputtime - starttime))
    logging.debug("processing took %d seconds",
                  (processtime - inputtime))

    logging.debug("Phase 10: Writing output")

    return voc