ncbo · caufieldjh · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -112,6 +112,7 @@ pipeline {
 					            string(credentialsId: 'aws_kg_hub_secret_key', variable: 'AWS_SECRET_ACCESS_KEY')]) {
 
                                 // Index, then upload
+                                // This will include the log files (onto_stats.yaml and total_stats.yaml)
                                 sh '. venv/bin/activate && multi_indexer -v --directory data/transformed/ --prefix https://kghub.io/$S3PROJECTDIR/ -x -u'
                                 sh '. venv/bin/activate && s3cmd -c $S3CMD_CFG put -pr --acl-public --cf-invalidate data/transformed/ s3://kg-hub-public-data/$S3PROJECTDIR/'
 

diff --git a/docs/_config.yml b/docs/_config.yml
@@ -21,7 +21,7 @@
 title: KG-Bioportal
 email: [email protected]
 description: >- # this means to ignore newlines until "baseurl:"
-  BioPortal as a Knowledge Graph. 
+  BioPortal as Knowledge Graphs.
 baseurl: "/kg-bioportal" # the subpath of your site, e.g. /blog
 url: "https://ncbo.github.io" # the base hostname & protocol for your site, e.g. http://example.com
 
@@ -49,9 +49,6 @@ theme: jekyll-theme-tactile
 #   - vendor/ruby/
 
 # Ontology status table goes under here.
-# TODO: generate stats during each build using the scripts in BioPortal-to-KGX.
-#       Store the stats on KG-Hub as YAML.
-#       Retrieve them with the build_site.sh script.
 nodecount: 4861311
 edgecount: 30762617
 ontologies:

diff --git a/docs/_config_header.yml b/docs/_config_header.yml
@@ -21,7 +21,7 @@
 title: KG-Bioportal
 email: [email protected]
 description: >- # this means to ignore newlines until "baseurl:"
-  BioPortal as a Knowledge Graph. 
+  BioPortal as Knowledge Graphs.
 baseurl: "/kg-bioportal" # the subpath of your site, e.g. /blog
 url: "https://ncbo.github.io" # the base hostname & protocol for your site, e.g. http://example.com
 
@@ -49,6 +49,3 @@ theme: jekyll-theme-tactile
 #   - vendor/ruby/
 
 # Ontology status table goes under here.
-# TODO: generate stats during each build using the scripts in BioPortal-to-KGX.
-#       Store the stats on KG-Hub as YAML.
-#       Retrieve them with the build_site.sh script.
diff --git a/docs/about.markdown b/docs/about.markdown
@@ -6,12 +6,12 @@ permalink: /about/
 
 ## What is KG-Bioportal?
 
-KG-Bioportal is a version of the set of ontologies on BioPortal in which nearly all ontologies have been merged into a single knowledge graph. This means it is a collection of entities and relations, with the classes in each ontology serving as the entities and the connections between ontologies becoming relations. Where possible, entities and relations are categorized using Biolink Model, so entries in [NCBI Taxonomy](https://bioportal.bioontology.org/ontologies/NCBITAXON) are categorized as [biolink:OrganismTaxon](https://biolink.github.io/biolink-model/docs/OrganismTaxon.html), and so on.
+KG-Bioportal is a version of the set of ontologies on BioPortal in which ontologies have been transformed to graph nodes and edges in the [KGX format](https://github.com/biolink/kgx/blob/master/specification/kgx-format.md). This means it is a collection of entities and relations, with the classes in each ontology serving as the entities and the connections between ontologies becoming relations. Where possible, entities and relations are categorized using Biolink Model, so entries in [NCBI Taxonomy](https://bioportal.bioontology.org/ontologies/NCBITAXON) are categorized as [biolink:OrganismTaxon](https://biolink.github.io/biolink-model/docs/OrganismTaxon.html), and so on.
 
 ## How is it made?
 
-KG-Bioportal is made by careful transformation of each ontology from a dump of its 4store form to graph nodes and edges compatible with the KGX tools. The nodes and edges are then merged with KGX.
+KG-Bioportal is made by careful transformation of each ontology retrieved from the [Bioportal API](https://data.bioontology.org/). Ontology files from Bioportal are transformed to a common format before being converted to nodes and edges.
 
 ## How is it useful?
 
-KG-Bioportal supports a holistic examination of a broad collection of hierarchical relationships in biology and biomedicine. Because all ontologies are contained within the same graph, they may be analysed by graph traversal and a growing collection of informative graph machine learning approaches.
+KG-Bioportal supports a holistic examination of a broad collection of hierarchical relationships in biology and biomedicine. Because all ontologies share a common format and data model, they may be merged in a modular fashion and analysed by graph traversal. This enables a growing collection of informative graph machine learning approaches.
diff --git a/docs/build_site.sh b/docs/build_site.sh
@@ -4,20 +4,20 @@
 # Define paths
 JEKYLL_CONFIG_HEADER_FILE="_config_header.yml"
 JEKYLL_CONFIG_FILE="_config.yml"
-GRAPH_STATS_URL="https://kg-hub.berkeleybop.io/kg-bioportal/graph_stats.yaml"
-GRAPH_STATS_FILE="graph_stats.yaml"
+TOTAL_STATS_URL="https://kg-hub.berkeleybop.io/kg-bioportal/total_stats.yaml"
+TOTAL_STATS_FILE="total_stats.yaml"
 ONTO_STATUS_URL="https://kg-hub.berkeleybop.io/kg-bioportal/onto_stats.yaml"
 ONTO_STATUS_FILE="onto_stats.yaml"
 
 # Retrieve most recent KG-Bioportal general stats
-wget -N $GRAPH_STATS_URL
+wget -N $TOTAL_STATS_URL
 
 # Retrieve most recent KG-Bioportal ontology status
 wget -N $ONTO_STATUS_URL
 
 # Append ontology status list
 echo "Adding all lists to Jekyll config."
-cat $JEKYLL_CONFIG_HEADER_FILE $GRAPH_STATS_FILE $ONTO_STATUS_FILE > $JEKYLL_CONFIG_FILE
+cat $JEKYLL_CONFIG_HEADER_FILE $TOTAL_STATS_FILE $ONTO_STATUS_FILE > $JEKYLL_CONFIG_FILE
 
 # Make figures
 echo "Producing figures."

diff --git a/docs/index.html b/docs/index.html
@@ -10,21 +10,18 @@ <h3>Learn more about KG-Bioportal</h3>
         <li><a href="https://ncbo.github.io/kg-bioportal/about">Learn more about KG-Bioportal</a></li>
         <li><a href="https://bioportal.bioontology.org/">Visit BioPortal</a></li>
         <li><a href="https://github.com/ncbo/kg-bioportal">See the KG-Bioportal GitHub repository</a></li>
-        <li><a href="https://github.com/ncbo/BioPortal-to-KGX">See the repository for translation from Bioportal to graph nodes and edges </a></li>
         <br>
 
       <h3>KG-Bioportal Overview</h3>
-      <p>The merged knowledge graph has the following properties: </p>
-      <p>Node Count: {{ site.nodecount }} </p>
-      <p>Edge Count: {{ site.edgecount }} </p>
+      <p>The collection includes graph files for {{ site.totalcount }} ontologies. </p>
 
       {% include fig1.html %}
       {% include fig2.html %}
       <br>
 
       <h3>KG-Bioportal Status</h3>
       <p>The table below lists the presence and status of each ontology in KG-Bioportal. 
-        Node and edge counts are pre-merged contributions from each ontology.</p>
+        Node and edge counts are the individual contributions of each ontology.</p>
     </div>
 
     <div class="col-md-12">

diff --git a/docs/make_viz.py b/docs/make_viz.py
@@ -4,7 +4,7 @@
 import plotly.express as px
 import yaml
 
-with open("onto_status.yaml", "r") as infile:
+with open("onto_stats.yaml", "r") as infile:
     ontos = pd.DataFrame(((yaml.safe_load(infile)))["ontologies"])
     countcols = ["nodecount", "edgecount"]
     ontos[countcols] = ontos[countcols].apply(pd.to_numeric, errors="coerce")
@@ -13,37 +13,49 @@
 # https://plotly.com/python/pie-charts/#pie-charts-in-subplots
 
 # Node counts across all ontologies, unmerged
-nodeontos = ontos.loc[ontos["nodecount"] < 150000, "id"] = "All other ontologies"
-fig1 = px.pie(
-    ontos,
-    values="nodecount",
-    names="id",
-    title="Nodes used to make KG-Bioportal",
-    hole=0.3,
+# Sort the DataFrame by nodecount in descending order and select the top 10
+top_ontos = ontos.nlargest(10, 'nodecount')
+
+fig1 = px.bar(
+    top_ontos,
+    x="id",
+    y="nodecount",
+    title="Top 10 Ontologies by Node Count in KG-Bioportal",
+    text="nodecount"
 )
-fig1.update_traces(textposition="inside", textinfo="percent+label")
+fig1.update_traces(texttemplate='%{text:.2s}', textposition='outside')
 fig1.update_layout(
     uniformtext_minsize=14,
     uniformtext_mode="hide",
     plot_bgcolor="rgba(0, 0, 0, 0)",
     paper_bgcolor="rgba(0, 0, 0, 0)",
+    xaxis_title="Ontology ID",
+    yaxis_title="Node Count",
+    xaxis=dict(categoryorder='total descending')
 )
+fig1.update_traces(marker=dict(color=px.colors.qualitative.Plotly))
 fig1.write_html("_includes/fig1.html", include_plotlyjs="cdn")
 
 # Edge counts across all ontologies, unmerged
-ontos.loc[ontos["edgecount"] < 150000, "id"] = "All other ontologies"
-fig2 = px.pie(
-    ontos,
-    values="edgecount",
-    names="id",
-    title="Edges used to make KG-Bioportal",
-    hole=0.3,
+# Sort the DataFrame by edgecount in descending order and select the top 10
+top_ontos = ontos.nlargest(10, 'edgecount')
+
+fig2 = px.bar(
+    top_ontos,
+    x="id",
+    y="edgecount",
+    title="Top 10 Ontologies by Edge Count in KG-Bioportal",
+    text="edgecount"
 )
-fig2.update_traces(textposition="inside", textinfo="percent+label")
+fig2.update_traces(texttemplate='%{text:.2s}', textposition='outside')
 fig2.update_layout(
     uniformtext_minsize=14,
     uniformtext_mode="hide",
     plot_bgcolor="rgba(0, 0, 0, 0)",
     paper_bgcolor="rgba(0, 0, 0, 0)",
+    xaxis_title="Ontology ID",
+    yaxis_title="Edge Count",
+    xaxis=dict(categoryorder='total descending')
 )
+fig2.update_traces(marker=dict(color=px.colors.qualitative.Plotly))
 fig2.write_html("_includes/fig2.html", include_plotlyjs="cdn")
diff --git a/src/kg_bioportal/cli.py b/src/kg_bioportal/cli.py
@@ -175,6 +175,10 @@ def download(
 def transform(input_dir, output_dir) -> None:
     """Transforms all ontologies in the input directory to KGX nodes and edges.
 
+    Writes two log files: total_stats.yaml and onto_stats.yaml.
+    The first contains the total count of transformed ontologies.
+    The second contains the status and node and edge counts for each ontology.
+
     Args:
         input_dir: A string pointing to the directory to import data from.
         output_dir: A string pointing to the directory to output data to.

diff --git a/src/kg_bioportal/transformer.py b/src/kg_bioportal/transformer.py
@@ -3,10 +3,13 @@
 import logging
 import os
 import sys
+from typing import Tuple
+
+import yaml
+from kgx.transformer import Transformer as KGXTransformer
 
 from kg_bioportal.downloader import ONTOLOGY_LIST_NAME
 from kg_bioportal.robot_utils import initialize_robot, robot_convert, robot_relax
-from kgx.transformer import Transformer as KGXTransformer
 
 # TODO: Don't repeat steps if the products already exist
 # TODO: Fix KGX hijacking logging
@@ -55,6 +58,10 @@ def __init__(
     def transform_all(self) -> None:
         """Transforms all ontologies in the input directory to KGX nodes and edges.
 
+        Writes two log files: total_stats.yaml and onto_stats.yaml.
+        The first contains the total count of transformed ontologies.
+        The second contains the status and node and edge counts for each ontology.
+
         Args:
             None.
 
@@ -66,6 +73,14 @@ def transform_all(self) -> None:
             f"Transforming all ontologies in {self.input_dir} to KGX nodes and edges."
         )
 
+        # This keeps track of the status of each transform.
+        # Ontology acronym IDs are keys.
+        # Values are dictionaries of:
+        # status: "OK" if transform was successful, otherwise "Failed".
+        # nodecount: Number of nodes in the ontology.
+        # edgecount: Number of edges in the ontology.
+        onto_log = {}
+
         filepaths = []
         for root, _dirs, files in os.walk(self.input_dir):
             for file in files:
@@ -79,23 +94,69 @@ def transform_all(self) -> None:
             logging.info(f"Found {len(filepaths)} ontologies to transform.")
 
         for filepath in filepaths:
-            if not self.transform(filepath):
+            ontology_name = (os.path.relpath(filepath, self.input_dir)).split(os.sep)[0]
+            success, nodecount, edgecount = self.transform(filepath)
+            if not success:
                 logging.error(f"Error transforming {filepath}.")
+                status = False
+                nodecount = 0
+                edgecount = 0
             else:
                 logging.info(f"Transformed {filepath}.")
+                status = True
+            if status == False:
+                strstatus = "Failed"
+            else:
+                strstatus = "OK"
+            onto_log[ontology_name] = {
+                "status": strstatus,
+                "nodecount": nodecount,
+                "edgecount": edgecount,
+            }
+
+        # Write total stats to a yaml
+        logging.info("Writing total stats to total_stats.yaml.")
+        # Count only successful transforms. Status is a string ("OK"/"Failed"),
+        # so compare it explicitly: any non-empty string would be truthy.
+        success_count = sum(
+            1 for entry in onto_log.values() if entry["status"] == "OK"
+        )
+        with open(os.path.join(self.output_dir, "total_stats.yaml"), "w") as f:
+            f.write("totalcount: " + str(success_count) + "\n")
+
+        # Dump onto_log to a yaml
+        # Rearrange it a bit first
+        logging.info("Writing ontology stats to onto_stats.yaml.")
+        onto_stats_list = []
+        for onto in onto_log:
+            onto_stats_list.append(
+                {
+                    "id": onto,
+                    "status": onto_log[onto]["status"],
+                    "nodecount": onto_log[onto]["nodecount"],
+                    "edgecount": onto_log[onto]["edgecount"],
+                }
+            )
+        with open(os.path.join(self.output_dir, "onto_stats.yaml"), "w") as of:
+            yaml.dump({"ontologies": onto_stats_list}, of)
 
         return None
 
-    def transform(self, ontology_path: str) -> bool:
+    def transform(self, ontology_path: str) -> Tuple[bool, int, int]:
         """Transforms a single ontology to KGX nodes and edges.
 
         Args:
             ontology: A string of the path to the ontology file to transform.
 
         Returns:
-            True if transform was successful, otherwise False.
+            Tuple of:
+                True if transform was successful, otherwise False.
+                Number of nodes in the ontology.
+                Number of edges in the ontology.
         """
         status = False
+        nodecount = 0
+        edgecount = 0
 
         ontology_name = (os.path.relpath(ontology_path, self.input_dir)).split(os.sep)[
             0
@@ -169,8 +230,19 @@ def transform(self, ontology_path: str) -> bool:
                 f"Nodes and edges written to {nodefilename} and {edgefilename}."
             )
             status = True
+
+            # Get length of nodefile
+            with open(nodefilename, "r") as f:
+                nodecount = len(f.readlines()) - 1
+
+            # Get length of edgefile
+            with open(edgefilename, "r") as f:
+                edgecount = len(f.readlines()) - 1
+
         except Exception as e:
-            logging.error(f"Error transforming {ontology_name} to KGX nodes and edges: {e}")
+            logging.error(
+                f"Error transforming {ontology_name} to KGX nodes and edges: {e}"
+            )
             status = False
 
-        return status
+        return status, nodecount, edgecount