Difference between revisions of "Phylotastic/TNRS"

From Evolutionary Interoperability and Outreach
Jump to: navigation, search
m
Line 19: Line 19:
 
====Returns====
 
====Returns====
 
{| class="wikitable"
 
{| class="wikitable"
! Section !! Field !! Meaning !! Examples
+
! Field !! Meaning !! Examples
 +
|-
 +
|message || Human readable message || "Your request is being processed. You can retrieve the results at http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15."
 +
|-
 +
|submit date || Date and time at which the request was submitted || "Mon Jun 11 20:25:16 2012"
 
|-
 
|-
 +
|token || Unique identifier assigned to the request (jobId) || "76ca0e9a3ab78e6bc5b4e362c8c40e15"
 
|-
 
|-
 +
|uri || Address at which the results can be retrieved ||  "http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15"
 
|}
 
|}
  
Line 30: Line 36:
  
 
  {
 
  {
     "message": "Your request is being processed. You can retrieve the results at http://api.phylotastic.org/tnrs/retrieve/9e342c53f54f12abfc6a204932843884.",  
+
     "message": "Your request is being processed. You can retrieve the results at http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15.",  
     "submit date": "Mon Jun 11 17:11:29 2012",  
+
     "submit date": "Mon Jun 11 20:25:16 2012",  
     "token": "9e342c53f54f12abfc6a204932843884",  
+
     "token": "76ca0e9a3ab78e6bc5b4e362c8c40e15",  
     "uri": "http://api.phylotastic.org/tnrs/retrieve/9e342c53f54f12abfc6a204932843884"
+
     "uri": "http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15"
  }  
+
  }
 
 
  
  
Line 42: Line 47:
  
 
====URI====
 
====URI====
<code><nowiki>http://api.phylotastic.org/tnrs/retrieve/<jobId></nowiki></code>
+
<code><nowiki>http://api.phylotastic.org/tnrs/retrieve/<token></nowiki></code>
  
 
====Parameters====
 
====Parameters====
Line 51: Line 56:
 
! Section !! Field !! Meaning !! Examples
 
! Section !! Field !! Meaning !! Examples
 
|-
 
|-
|metadata || jobId || The job-id which was submitted (for asynchronous requests) || 12345, "1-ABC-789"
+
|metadata || jobId || The job-id which was submitted (for asynchronous requests) || "76ca0e9a3ab78e6bc5b4e362c8c40e15"
 
|-
 
|-
|metadata || submitDate || Date on which this job was submitted in [http://en.wikipedia.org/wiki/ISO_8601 ISO 8601 format]. || "2012-06-06T14:54Z"
+
|metadata || submitDate || Date on which this job was submitted. || "Mon Jun 11 20:25:16 2012"
 
|-
 
|-
 
|metadata || sources ||colspan="2"| An array of all the sources available to our TNRS service, in the following format:
 
|metadata || sources ||colspan="2"| An array of all the sources available to our TNRS service, in the following format:
Line 101: Line 106:
 
====Example====
 
====Example====
  
GET http://api.phylotastic.org/tnrs/retrieve/9e342c53f54f12abfc6a204932843884
+
GET http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15
  
 
  {
 
  {
 
     "metadata": {
 
     "metadata": {
         "jobId": "9e342c53f54f12abfc6a204932843884",  
+
         "jobId": "76ca0e9a3ab78e6bc5b4e362c8c40e15",  
         "sources": 0,  
+
         "sources": [
         "submitDate": "Mon Jun 11 17:11:29 2012"
+
            {
 +
                "annotations": {},
 +
                "description": "NCBI Taxonomy",
 +
                "name": "NCBI",
 +
                "publication": "Federhen S. The Taxonomy Project.2002 Oct 9 [Updated 2003 Aug 13]. In: McEntyre J., Ostell J., editors. The NCBI Handbook [Internet]. Bethesda (MD): National Center for Biotechnology Information (US);2002.",
 +
                "rank": 3,
 +
                "sourceId": "NCBI",
 +
                "status": "200: OK",
 +
                "uri": "http://www.ncbi.nlm.nih.gov/taxonomy"
 +
            },
 +
            {
 +
                "annotations": {
 +
                    "Authority": "Author attributed to the accepted name (where applicable)."
 +
                },
 +
                "description": "The iPlant Collaborative TNRS provides parsing and fuzzy matching for plant taxa.",
 +
                "name": "iPlant Collaborative TNRS v3.0",
 +
                "publication": "The Taxonomic Name Resolution Service; http://tnrs.iplantcollaborative.org; version 3.0.",
 +
                "rank": 2,
 +
                "sourceId": "iPlant TNRS",
 +
                "status": "200: OK",
 +
                "uri": "http://tnrs.iplantcollaborative.org/"
 +
            }
 +
        ],  
 +
         "sub_date": "Mon Jun 11 20:25:16 2012"
 
     },  
 
     },  
 
     "names": [
 
     "names": [
 
         {
 
         {
             "matchCount": 3,  
+
             "matchCount": 1,  
 
             "matches": [
 
             "matches": [
                null,
 
                null,
 
 
                 {
 
                 {
 
                     "acceptedName": "Humbertia",  
 
                     "acceptedName": "Humbertia",  
Line 122: Line 148:
 
                     "matchedName": "Humbertia",  
 
                     "matchedName": "Humbertia",  
 
                     "score": "0.46973019780931",  
 
                     "score": "0.46973019780931",  
                     "sourceId": null,  
+
                     "sourceId": "iPlant TNRS",  
 
                     "uri": "http://www.tropicos.org/Name/40028244"
 
                     "uri": "http://www.tropicos.org/Name/40028244"
 
                 }
 
                 }
Line 129: Line 155:
 
         },  
 
         },  
 
         {
 
         {
             "matchCount": 4,  
+
             "matchCount": 2,
 +
            "matches": [
 +
                {
 +
                    "acceptedName": "Vitis vinifera",
 +
                    "annotations": {
 +
                        "Authority": "L."
 +
                    },
 +
                    "matchedName": "Vitis vinifera",
 +
                    "score": "1",
 +
                    "sourceId": "iPlant TNRS",
 +
                    "uri": "http://www.tropicos.org/Name/34000217"
 +
                },
 +
                {
 +
                    "acceptedName": "Vitis vinifera",
 +
                    "annotations": {},
 +
                    "matchedName": "Vitis vinifera",
 +
                    "score": "1",
 +
                    "sourceId": "NCBI",
 +
                    "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/29760"
 +
                }
 +
            ],
 +
            "submittedName": "Vitis vinifera"
 +
        },
 +
        {
 +
            "matchCount": 2,  
 
             "matches": [
 
             "matches": [
                null,
 
                null,
 
 
                 {
 
                 {
 
                     "acceptedName": "Mangifera indica",  
 
                     "acceptedName": "Mangifera indica",  
Line 140: Line 188:
 
                     "matchedName": "Mangifera indica",  
 
                     "matchedName": "Mangifera indica",  
 
                     "score": "0.98210117101673",  
 
                     "score": "0.98210117101673",  
                     "sourceId": null,  
+
                     "sourceId": "iPlant TNRS",  
 
                     "uri": "http://www.tropicos.org/Name/1300071"
 
                     "uri": "http://www.tropicos.org/Name/1300071"
 
                 },  
 
                 },  
Line 148: Line 196:
 
                     "matchedName": "Magnifera indica",  
 
                     "matchedName": "Magnifera indica",  
 
                     "score": "1",  
 
                     "score": "1",  
                     "sourceId": null,  
+
                     "sourceId": "NCBI",  
 
                     "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/29780"
 
                     "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/29780"
 
                 }
 
                 }
Line 155: Line 203:
 
         },  
 
         },  
 
         {
 
         {
             "matchCount": 3,  
+
             "matchCount": 1,  
 
             "matches": [
 
             "matches": [
                null,
 
                null,
 
 
                 {
 
                 {
 
                     "acceptedName": "Euthamia",  
 
                     "acceptedName": "Euthamia",  
Line 166: Line 212:
 
                     "matchedName": "Euthamia",  
 
                     "matchedName": "Euthamia",  
 
                     "score": "0.45701346754469",  
 
                     "score": "0.45701346754469",  
                     "sourceId": null,  
+
                     "sourceId": "iPlant TNRS",  
 
                     "uri": "http://www.tropicos.org/Name/40007649"
 
                     "uri": "http://www.tropicos.org/Name/40007649"
 
                 }
 
                 }
Line 173: Line 219:
 
         },  
 
         },  
 
         {
 
         {
             "matchCount": 4,  
+
             "matchCount": 2,  
 
             "matches": [
 
             "matches": [
                null,
 
                null,
 
 
                 {
 
                 {
 
                     "acceptedName": "Megalachne",  
 
                     "acceptedName": "Megalachne",  
Line 184: Line 228:
 
                     "matchedName": "Pantathera",  
 
                     "matchedName": "Pantathera",  
 
                     "score": "0.47790686999749",  
 
                     "score": "0.47790686999749",  
                     "sourceId": null,  
+
                     "sourceId": "iPlant TNRS",  
 
                     "uri": "http://www.tropicos.org/Name/40015658"
 
                     "uri": "http://www.tropicos.org/Name/40015658"
 
                 },  
 
                 },  
Line 192: Line 236:
 
                     "matchedName": "Panthera tigris",  
 
                     "matchedName": "Panthera tigris",  
 
                     "score": "1",  
 
                     "score": "1",  
                     "sourceId": null,  
+
                     "sourceId": "NCBI",  
 
                     "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/9694"
 
                     "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/9694"
 
                 }
 
                 }
Line 200: Line 244:
 
     ]
 
     ]
 
  }
 
  }
 +
  
 
==Demo==
 
==Demo==

Revision as of 22:32, 11 June 2012

The Taxonomic Name Resolution Service translates scientific names to scientific names as found in a TNRS, ideally identifying them by means of a URL. The goal is to standardize the names being used in the trees in Phylotastic as well as to standardize names provided by users when generating subtrees.

Team

API

GET | POST /submit

Submit a list of taxonomic names to be resolved.

URI

http://api.phylotastic.org/tnrs/submit

Parameters

  • query (required, string): A URL-encoded, newline-delimited list of taxon names (e.g. Panthera+tigris%0AEutamias+minimus%0AMagnifera+indica%0AHumbert+humbert)

Returns

Field Meaning Examples
message Human readable message "Your request is being processed. You can retrieve the results at http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15."
submit date Date and time at which the request was submitted "Mon Jun 11 20:25:16 2012"
token Unique identifier assigned to the request (jobId) "76ca0e9a3ab78e6bc5b4e362c8c40e15"
uri Address at which the results can be retrieved "http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15"

Example

GET http://api.phylotastic.org/tnrs/submit?query=Panthera+tigris%0AEutamias+minimus%0AMagnifera+indica%0AHumbert+humbert


{
   "message": "Your request is being processed. You can retrieve the results at http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15.", 
   "submit date": "Mon Jun 11 20:25:16 2012", 
   "token": "76ca0e9a3ab78e6bc5b4e362c8c40e15", 
   "uri": "http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15"
}


GET /retrieve

Retrieve the resolved names

URI

http://api.phylotastic.org/tnrs/retrieve/<token>

Parameters

  • none

Returns

Section Field Meaning Examples
metadata jobId The job-id which was submitted (for asynchronous requests) "76ca0e9a3ab78e6bc5b4e362c8c40e15"
metadata submitDate Date on which this job was submitted. "Mon Jun 11 20:25:16 2012"
metadata sources An array of all the sources available to our TNRS service, in the following format:
Field Description Example
sourceId A short string used to name this source "ITIS", "NCBI Taxonomy", "iPlant TNRS"
sourceName The full name of this source "iPlant Collaborative TNRS v3"
uri A URL used to identify this source; generally the HTTP URL for the frontpage "http://www.itis.gov/", "http://www.ncbi.nlm.nih.gov/taxonomy"
rank The rank to which we assign this source. Multiple sources *cannot* have the same rank. 1, 4, 5
status The status of this TNRS at the time of this request. Note that "offline" or "temporarily offline" TNRSes were NOT queried for the results returned in this document. Either "online" or "offline" or "temporarily offline"
annotations A dictionary containing a list of annotations which MIGHT be produced by this TNRS, mapped to descriptions of that annotation. {'nucleotide_uri': "A link to nucleotide sequences on GenBank for this taxon", 'protein_uri': "A link to protein sequences on GenBank for this taxon."}
names submittedName The name that was submitted for name resolution. "Feeelis tigris"
names matchCount The number of successful matches 0, 2, 4
names matches An array containing a list of matches, in the following format:
Field Description Example
sourceId A short string used to name the TNRS source from which this name was extracted. See metadata['sources'] to look up the metadata associated with this source. "ITIS", "NCBI Taxonomy", "iPlant TNRS"
matchedName The name matched in this TNRS from the name submitted. There MUST be a name entry in the TNRS for this name, although it is not necessarily valid/accepted. Unlike DarwinCore's scientificName field, we prefer that this not contain the taxonomic authority, although it may contain it if the TNRS does not provide a single uni/bi/trinomial. "Felis tigris"
acceptedName The currently accepted name for individuals of the taxon identified in matchedName. If the TNRS does not contain synonymy information, or if there is no currently accepted name, this field should be blank. Unlike DarwinCore's acceptedNameUsage field, we prefer that this not contain the taxonomic authority, although it may contain it if the TNRS does not provide a single uni/bi/trinomial. "Panthera tigris"
uri A URI corresponding to the acceptedName (NOT the matchedName). Ideally, this should be an HTTP URL to an RDF document, but an HTML document is also fine. TODO: We need a way of indicating whether this is an RDF document or not; either with different field names ("uri" vs "rdf") or possibly hacking it via different schemas: "http+rdf://" vs "http://", for instance. "http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:2478188" (RDF) or "http://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=183805" (HTML)
annotations A dictionary of annotations specific to this TNRS. See metadata['sources'][0]['annotations'], etc. for the descriptions of these annotations. {'nucleotide_uri': "http://www.ncbi.nlm.nih.gov/nuccore/?term=txid9694[Organism:exp]", 'protein_uri': "http://www.ncbi.nlm.nih.gov/protein/?term=txid9694[Organism:exp]"}
score A score (from 0 to 1) indicating how certain the TNRS is of this match. Note that in some cases (where the TNRS does not provide scores), the controller may calculate its own score (either by calculating the number of characters different between the matchedName and the submittedName, or by simply setting it to '1.0' where they are identical and '0.5' where they are not). 0.5, 0.6667, 0.98989

Example

GET http://api.phylotastic.org/tnrs/retrieve/76ca0e9a3ab78e6bc5b4e362c8c40e15

{
   "metadata": {
       "jobId": "76ca0e9a3ab78e6bc5b4e362c8c40e15", 
       "sources": [
           {
               "annotations": {}, 
               "description": "NCBI Taxonomy", 
               "name": "NCBI", 
               "publication": "Federhen S. The Taxonomy Project.2002 Oct 9 [Updated 2003 Aug 13]. In: McEntyre J., Ostell J., editors. The NCBI Handbook [Internet]. Bethesda (MD): National Center for Biotechnology Information (US);2002.", 
               "rank": 3, 
               "sourceId": "NCBI", 
               "status": "200: OK", 
               "uri": "http://www.ncbi.nlm.nih.gov/taxonomy"
           }, 
           {
               "annotations": {
                   "Authority": "Author attributed to the accepted name (where applicable)."
               }, 
               "description": "The iPlant Collaborative TNRS provides parsing and fuzzy matching for plant taxa.", 
               "name": "iPlant Collaborative TNRS v3.0", 
               "publication": "The Taxonomic Name Resolution Service; http://tnrs.iplantcollaborative.org; version 3.0.", 
               "rank": 2, 
               "sourceId": "iPlant TNRS", 
               "status": "200: OK", 
               "uri": "http://tnrs.iplantcollaborative.org/"
           }
       ], 
       "sub_date": "Mon Jun 11 20:25:16 2012"
   }, 
   "names": [
       {
           "matchCount": 1, 
           "matches": [
               {
                   "acceptedName": "Humbertia", 
                   "annotations": {
                       "Authority": "Lam."
                   }, 
                   "matchedName": "Humbertia", 
                   "score": "0.46973019780931", 
                   "sourceId": "iPlant TNRS", 
                   "uri": "http://www.tropicos.org/Name/40028244"
               }
           ], 
           "submittedName": "Humbert humbert"
       }, 
       {
           "matchCount": 2, 
           "matches": [
               {
                   "acceptedName": "Vitis vinifera", 
                   "annotations": {
                       "Authority": "L."
                   }, 
                   "matchedName": "Vitis vinifera", 
                   "score": "1", 
                   "sourceId": "iPlant TNRS", 
                   "uri": "http://www.tropicos.org/Name/34000217"
               }, 
               {
                   "acceptedName": "Vitis vinifera", 
                   "annotations": {}, 
                   "matchedName": "Vitis vinifera", 
                   "score": "1", 
                   "sourceId": "NCBI", 
                   "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/29760"
               }
           ], 
           "submittedName": "Vitis vinifera"
       }, 
       {
           "matchCount": 2, 
           "matches": [
               {
                   "acceptedName": "Mangifera indica", 
                   "annotations": {
                       "Authority": "L."
                   }, 
                   "matchedName": "Mangifera indica", 
                   "score": "0.98210117101673", 
                   "sourceId": "iPlant TNRS", 
                   "uri": "http://www.tropicos.org/Name/1300071"
               }, 
               {
                   "acceptedName": "Mangifera indica", 
                   "annotations": {}, 
                   "matchedName": "Magnifera indica", 
                   "score": "1", 
                   "sourceId": "NCBI", 
                   "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/29780"
               }
           ], 
           "submittedName": "Magnifera indica"
       }, 
       {
           "matchCount": 1, 
           "matches": [
               {
                   "acceptedName": "Euthamia", 
                   "annotations": {
                       "Authority": "(Nutt.) Cass."
                   }, 
                   "matchedName": "Euthamia", 
                   "score": "0.45701346754469", 
                   "sourceId": "iPlant TNRS", 
                   "uri": "http://www.tropicos.org/Name/40007649"
               }
           ], 
           "submittedName": "Eutamias minimus"
       }, 
       {
           "matchCount": 2, 
           "matches": [
               {
                   "acceptedName": "Megalachne", 
                   "annotations": {
                       "Authority": "Steud."
                   }, 
                   "matchedName": "Pantathera", 
                   "score": "0.47790686999749", 
                   "sourceId": "iPlant TNRS", 
                   "uri": "http://www.tropicos.org/Name/40015658"
               }, 
               {
                   "acceptedName": "Panthera tigris", 
                   "annotations": {}, 
                   "matchedName": "Panthera tigris", 
                   "score": "1", 
                   "sourceId": "NCBI", 
                   "uri": "http://www.ncbi.nlm.nih.gov/taxonomy/9694"
               }
           ], 
           "submittedName": "Panthera tigris"
       }
   ]
}


Demo

A demonstration implementation of this API (snappily named tnrastic) was developed by the team at Phylotastic. It consists of a Perl web application using the Dancer framework which handles API requests. We've written adaptors for NCBI Taxonomy, iPlant TNRS and ITIS, as well as a hook into NCBI Taxonomy's spelling correction feature.

Adaptors

Each TNRS is represented by an adaptor, which is an executable (generally either a Perl or Python script).

We use a very simple subset of our main API to communicate with adaptors. Each adaptor accepts a newline-delimited list of taxa through standard input; it writes out a JSON file to standard output in the following format in case of success:

 {"names":[{"submittedName":"Eutamias minimus","acceptedName":"Tamias minimus","score":0.5,"matchedName":"Eutamias minimus","annotations":{"TSN":"180195","originalTSN":"180144"},"uri":"http://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value=180195"}],"status":200,"errorMessage":""}

And the following in case of error:

 {"status": 500, "errorMessage": "Could not connect to the server"}

Existing TNRSes we can plug into

Please remember that any of these TNRS might have incorrect or outdated data, cross-code synonymies, or any other problem!

  • iPlant TNRS: Only plants (via Tropicos/NCBI Taxonomy/USDA Plants/Global Compositae Checklist)
  • ITIS: All life, but focuses on North American taxa
  • EOL: All life, merges multiple taxonomic trees from different providers
  • NCBI Taxonomy: All life
  • uBio: All life
  • WoRMS: Marine species
  • Global Names Index, which contains ~17 million names, and returns lexical groups of similar names and links to sources
  • Global Names Recognition service, which identifies things that look like taxon names in a document or webpage


Feature Matrix

Name Animals Plants Fungi Micro Global Typos Common names Synonyms Cross ID Classification Support scores Taxonomic parsing WS info Notes
iPlant TNRS No Yes No No Partial Yes No Yes Yes Yes Yes Yes [1]? Hierarchical search possible
ITIS Yes Yes Yes  ?  ? No No Partial  ? Yes No  ? [2]
EOL Yes Yes Yes  ? Yes No Yes Partial Yes Yes No  ?  ?
NCBI Taxonomy Yes Yes Yes Yes Yes Yes No Yes ? Yes No  ? [3] Contains many taxonomically invalid names
uBio yes yes yes yes  ?  ?  ?  ?  ?  ?  ?  ? [4]
Required? Yes Yes Yes Yes Yes Yes  ? Yes  ?  ? Yes  ? NA Taxonomic parsing might be required for infraspecifics and authors


Day 1 Discussion

We came up with three alternative API designs, ranging from simple to elaborate. The choice of these strategies has to be coordinated and matched against the core architecture, especially tree storage and retrieval.

Design 1 (simple)

In the simplest scenario TNRS simply returns a list of all known possible valid names for a given (potentially invalid) name. The list of names can be annotated with attributes such as source, associated ids, their status (i.e. whether a name is the canonical name for that species), etc. In this scenario, the burden of figuring out what to do with each name is on the users of the API. The way we envision users of the API will use the returned list is by searching all the mega-trees for all the given names. So, if any of the names match a name in the mega-tree, that name should be used.

In those cases where a name is associated with multiple species, this API can try to return multiple lists, each corresponding to a different species. However, it is not always possible to (easily) figure out these cases from the output of external TNRS services we are going to use.

Design 2 (in-between)

In this design, we still have the operation described in the first design. In addition, we return one of the available names for each species as the current name. This will not have to be the correct name for the species (whatever that means), but it has to be consistent. This single consistent name will enable users of the TNRS service to match species across different trees, and to match them against user queries. But there is going to be a limitation to this consistency. Over time, what we return as the current name can change. This complicates matters in imaginable ways for the users of the API. If mega-trees are stored, the stored taxon names could become outdated (out of synch with the current name returned by TNRS). Possible solutions to this problem are:

  • Updating stored mega-trees periodically, so that they are synchronized with current names returned by the TNRS service.
  • Every time a new query comes in, we query the current name for all the taxa, updating the changed names in all the stored trees.

Design 3 (elaborate)

In the most elaborate design, we use IDs to formalize entities stored in mega-trees. We will assign one ID for each species stored in our system. Stored trees should use these IDs (not species names) to label their tips. The TNRS service will include two operations: returning an ID given a (potentially incorrect) name, and returning a currently accepted name for a given ID. In case two species have the same name, the two species should be assigned different IDs and the service should return both IDs. A typical usage of the API will be taking user-provided names, mapping those to IDs, finding those IDs in the stored trees, pruning and grafting, and getting a tree with tips labeled with IDs; then, IDs are turned into current accepted names, and these are the names that are shown to the user.

The idea here is that IDs will be associated with species, and hence more stable through time, eliminating the need for frequent update of the stored trees.

How exactly the IDs should be assigned to species has to be discussed. We considered using existing IDs from sources such as ITIS. This can be achieved by ranking sources, but we have to be careful about whether those IDs stay constant through time. An alternative is generating new IDs internal to phylotastic (maybe not a good idea?).

General Concerns

No matter which design we choose, there are two concepts that can be implemented on top of our APIs: caching and batching. Caching will permit us to improve performance, especially for the fuzzy match which can be quite slow. Batching permits the user to search for a list of names and get a list of responses in one call.

In addition, we discussed whether our API needs to be synchronous or asynchronous. Our current thinking is that we need to provide two interfaces for each operation, one that is synchronous and does a simple and fast search (without fuzzy matching), and another one that is more thorough and is asynchronous.

Design discussion

  • Return 1 name or multiple names?
    • Scores?
  • Caching?
  • Which TNRS do fuzzy matching?

Questions/notes

  • What if we end up renaming the name-string given to us by the user? We need to make sure to have a warning to the user ("Your query 'Panthera tigris' was renamed to 'Leonardo tigris' for this search because of ...").

Galaxy specification for PhyloTNRS

(The following sample XML file is based on http://wiki.g2.bx.psu.edu/Admin/Training/ISMB2010%20Galaxy%20Tutorial:%20Running%20Your%20Own#Tools but see http://wiki.g2.bx.psu.edu/Admin/Tools/Tool%20Config%20Syntax for a full syntax)

<tool id="org.nescent.phylotastic.tnrs" name="Phylotastic TNRS">
 <description>Extracts data from multiple TNRS </description>
 <command interpreter="python">get_flanks.py $input $out_file1 $size $direction $region -o $offset -l ${input.metadata.chromCol},${input.metadata.startCol},${input.metadata.endCol},${input.metadata.strandCol}</command>
 <inputs>
   <param format="interval" name="input" type="data" label="Select data"/>
   <param name="region" type="select" label="Region">
     <option value="whole" selected="true">Whole feature</option>
     <option value="start">Around Start</option>
     <option value="end">Around End</option>
   </param>
   <param name="direction" type="select" label="Location of the flanking region/s">
     <option value="Upstream">Upstream</option>
     <option value="Downstream">Downstream</option>
     <option value="Both">Both</option>
   </param>
   <param name="offset" size="10" type="integer" value="0" label="Offset" help="Use positive values to offset co-ordinates in the direction of transcription and negative values to offset in the opposite direction."/>
   <param name="size" size="10" type="integer" value="50" label="Length of the flanking region(s)" help="Use non-negative value for length"/>
 </inputs>
 <outputs>
   <data format="interval" name="out_file1" metadata_source="input"/>
 </outputs>
  ...
 </tool>