Improve acronym and hyphenated word handling

blakeembrey · Dec 16, 2023 · f6ce967 · f6ce967
1 parent 0f6e762
commit f6ce967
Show file tree

Hide file tree

Showing 3 changed files with 118 additions and 46 deletions.
diff --git a/packages/title-case/README.md b/packages/title-case/README.md
@@ -19,12 +19,12 @@ titleCase("follow step-by-step instructions"); //=> "Follow Step-by-Step Instruc
 
 ### Options
 
-- `locale?: string | string[]`
+- `locale?: string | string[]` Locale used for `toLocaleUpperCase` during case transformation (default: `undefined`)
 - `sentenceCase?: boolean` Only capitalize the first word of each sentence (default: `false`)
 - `sentenceTerminators?: Set<string>` Set of characters to consider a new sentence under sentence case behavior (e.g. `.`, default: `SENTENCE_TERMINATORS`)
 - `smallWords?: Set<string>` Set of words to keep lower-case when `sentenceCase === false` (default: `SMALL_WORDS`)
-- `titleTerminators?: Set<string>` Set of characters to consider a new sentence under title case behavior (e.g. `:`, default: `TITLE_TERMINATORS`).
-- `wordSeparators?: Set<string>` Set of characters to consider a new word for capitalization, such as hyphenation (default: `WORD_SEPARATORS`).
+- `titleTerminators?: Set<string>` Set of characters to consider a new sentence under title case behavior (e.g. `:`, default: `TITLE_TERMINATORS`)
+- `wordSeparators?: Set<string>` Set of characters to consider a new word for capitalization, such as hyphenation (default: `WORD_SEPARATORS`)
 
 ## TypeScript and ESM
 

diff --git a/packages/title-case/src/index.spec.ts b/packages/title-case/src/index.spec.ts
@@ -3,20 +3,31 @@ import { inspect } from "util";
 import { titleCase, Options } from "./index.js";
 
 /**
- * Based on https://github.com/gouch/to-title-case/blob/master/test/tests.json.
+ * Original tests from https://github.com/gouch/to-title-case/blob/master/test/tests.json.
  */
 const TEST_CASES: [string, string, Options?][] = [
-  ["", ""],
-  ["2019", "2019"],
-  ["test", "Test"],
-  ["two words", "Two Words"],
-  ["one. two.", "One. Two."],
+  ["one two", "One Two"],
+  ["one two three", "One Two Three"],
+  [
+    "Start a an and as at but by en for if in nor of on or per the to v vs via end",
+    "Start a an and as at but by en for if in nor of on or per the to v vs via End",
+  ],
   ["a small word starts", "A Small Word Starts"],
   ["small word ends on", "Small Word Ends On"],
+  ["questions?", "Questions?"],
+  ["Two questions?", "Two Questions?"],
+  ["one sentence. two sentences.", "One Sentence. Two Sentences."],
   ["we keep NASA capitalized", "We Keep NASA Capitalized"],
   ["pass camelCase through", "Pass camelCase Through"],
+  ["this sub-phrase is nice", "This Sub-Phrase Is Nice"],
   ["follow step-by-step instructions", "Follow Step-by-Step Instructions"],
+  ["easy as one-two-three end", "Easy as One-Two-Three End"],
+  ["start on-demand end", "Start On-Demand End"],
+  ["start in-or-out end", "Start In-or-Out End"],
+  ["start e-commerce end", "Start E-Commerce End"],
+  ["start e-mail end", "Start E-Mail End"],
   ["your hair[cut] looks (nice)", "Your Hair[cut] Looks (Nice)"],
+  ["keep that colo(u)r", "Keep that Colo(u)r"],
   ["leave Q&A unscathed", "Leave Q&A Unscathed"],
   [
     "piña colada while you listen to ænima",
@@ -31,21 +42,31 @@ const TEST_CASES: [string, string, Options?][] = [
   ['"double quotes"', '"Double Quotes"'],
   ['double quotes "inner" word', 'Double Quotes "Inner" Word'],
   ["fancy double quotes “inner” word", "Fancy Double Quotes “Inner” Word"],
+  ["'single quotes'", "'Single Quotes'"],
+  ["single quotes 'inner' word", "Single Quotes 'Inner' Word"],
+  ["fancy single quotes ‘inner’ word", "Fancy Single Quotes ‘Inner’ Word"],
+  ["“‘a twice quoted subtitle’”", "“‘A Twice Quoted Subtitle’”"],
   ["have you read “The Lottery”?", "Have You Read “The Lottery”?"],
   ["one: two", "One: Two"],
   ["one two: three four", "One Two: Three Four"],
   ['one two: "Three Four"', 'One Two: "Three Four"'],
+  ["one on: an end", "One On: An End"],
+  ['one on: "an end"', 'One On: "An End"'],
   ["email [email protected] address", "Email [email protected] Address"],
   [
     "you have an https://example.com/ title",
     "You Have an https://example.com/ Title",
   ],
   ["_underscores around words_", "_Underscores Around Words_"],
   ["*asterisks around words*", "*Asterisks Around Words*"],
-  ["this vs. that", "This vs. That"],
   ["this vs that", "This vs That"],
-  ["this v. that", "This v. That"],
+  ["this *vs* that", "This *vs* That"],
   ["this v that", "This v That"],
+  // Contractions with a period are not supported due to sentence support.
+  // It's difficult to tell if a period is part of a contraction or not.
+  ["this vs. that", "This Vs. That"],
+  ["this v. that", "This V. That"],
+  ["", ""],
   [
     "Scott Moritz and TheStreet.com’s million iPhone la-la land",
     "Scott Moritz and TheStreet.com’s Million iPhone La-La Land",
@@ -54,6 +75,7 @@ const TEST_CASES: [string, string, Options?][] = [
     "Notes and observations regarding Apple’s announcements from ‘The Beat Goes On’ special event",
     "Notes and Observations Regarding Apple’s Announcements From ‘The Beat Goes On’ Special Event",
   ],
+  ["2018", "2018"],
   [
     "the quick brown fox jumps over the lazy dog",
     "The Quick Brown Fox Jumps over the Lazy Dog",
@@ -76,17 +98,44 @@ const TEST_CASES: [string, string, Options?][] = [
   ["the iPhone: a quote", "The iPhone: A Quote"],
   ["the iPhone: a quote", "The iPhone: a quote", { sentenceCase: true }],
   ["the U.N. and me", "The U.N. and Me"],
+  ["the *U.N.* and me", "The *U.N.* and Me"],
   ["the U.N. and me", "The U.N. and me", { sentenceCase: true }],
   ["the U.N. and me", "The U.N. And Me", { smallWords: new Set() }],
   ["start-and-end", "Start-and-End"],
   ["go-to-iPhone", "Go-to-iPhone"],
+  ["the go-to", "The Go-To"],
+  ["the go-to", "The go-to", { sentenceCase: true }],
+  ["this to-go", "This To-Go"],
+  ["test(ing)", "Test(ing)"],
+  ["test(s)", "Test(s)"],
   ["Keep #tag", "Keep #tag"],
   ['"Hello world", says John.', '"Hello World", Says John.'],
   [
     '"Hello world", says John.',
     '"Hello world", says John.',
     { sentenceCase: true },
   ],
+  ["foo/bar", "Foo/Bar"],
+  ["this is the *end.*", "This Is the *End.*"],
+  ["*something about me?* and you.", "*Something About Me?* And You."],
+  [
+    "*something about me?* and you.",
+    "*Something about me?* And you.",
+    { sentenceCase: true },
+  ],
+  ["something about _me-too?_ and you.", "Something About _Me-Too?_ And You."],
+  ["something about _me_? and you.", "Something About _Me_? And You."],
+  [
+    "something about _me_? and you.",
+    "Something about _me_? And you.",
+    { sentenceCase: true },
+  ],
+  [
+    "something about _me-too_? and you too.",
+    "Something About _Me-Too_? And You Too.",
+  ],
+  ["an example. i.e. test.", "An Example. I.e. Test."],
+  ['an example. "i.e. test."', 'An Example. "I.e. Test."'],
 ];
 
 describe("swap case", () => {

diff --git a/packages/title-case/src/index.ts b/packages/title-case/src/index.ts
@@ -1,8 +1,9 @@
 const TOKENS = /(\S+)|(.)/g;
-const IS_SPECIAL_CASE = /[\.#]\p{L}/u; // #tag, example.com, etc.
+const IS_SPECIAL_CASE = /[\.#]\p{Alphabetic}/u; // #tag, example.com, etc.
 const IS_MANUAL_CASE = /\p{Ll}(?=[\p{Lu}])/u; // iPhone, iOS, etc.
-const ALPHANUMERIC_PATTERN = /[\p{L}\d]+/gu;
-const IS_ACRONYM = /(?:\p{Lu}\.){2,}$/u;
+const ALPHANUMERIC_PATTERN = /\p{Alphabetic}+/gu;
+const IS_ACRONYM =
+  /^(\P{Alphabetic})*(?:\p{Alphabetic}\.){2,}(\P{Alphabetic})*$/u;
 
 export const WORD_SEPARATORS = new Set(["—", "–", "-", "―", "/"]);
 
@@ -94,61 +95,83 @@ export function titleCase(
 
     // Ignore URLs, email addresses, acronyms, etc.
     if (IS_SPECIAL_CASE.test(token)) {
-      result += token;
-
-      // The period at the end of an acronym is not a new sentence.
-      if (IS_ACRONYM.test(token)) {
-        isNewSentence = false;
+      const acronym = token.match(IS_ACRONYM);
+
+      // The period at the end of an acronym is not a new sentence,
+      // but we should still uppercase the first letter for i.e., e.g., etc.
+      if (acronym) {
+        const [_, prefix = "", suffix = ""] = acronym;
+        result += upperAt(token, prefix.length, locale);
+        isNewSentence = terminators.has(suffix.charAt(0));
         continue;
       }
+
+      result += token;
+      isNewSentence = terminators.has(token.charAt(token.length - 1));
     } else {
       const matches = Array.from(token.matchAll(ALPHANUMERIC_PATTERN));
       let value = token;
+      let isSentenceEnd = false;
 
       for (let i = 0; i < matches.length; i++) {
         const { 0: word, index: wordIndex = 0 } = matches[i];
+        const nextChar = token.charAt(wordIndex + word.length);
+
+        isSentenceEnd = terminators.has(nextChar);
 
-        // Reset "new sentence" when we find a word.
+        // Always capitalize the first word and reset "new sentence".
         if (isNewSentence) {
           isNewSentence = false;
-        } else {
-          // Skip capitalizing all words if sentence case is enabled.
-          if (sentenceCase) {
+        }
+        // Skip capitalizing when sentence case is enabled or the word is manually cased (e.g. iPhone).
+        else if (sentenceCase || IS_MANUAL_CASE.test(word)) {
+          continue;
+        }
+        // Handle simple words.
+        else if (matches.length === 1) {
+          // Avoid capitalizing small words, except at the end of a sentence.
+          if (smallWords.has(word)) {
+            const isFinalToken = index + token.length === input.length;
+
+            if (!isFinalToken && !isSentenceEnd) {
+              continue;
+            }
+          }
+        }
+        // Multi-word tokens need to be parsed differently.
+        else if (i > 0) {
+          // Avoid capitalizing words without a valid word separator,
+          // e.g. "apple's" or "test(ing)".
+          if (!wordSeparators.has(token.charAt(wordIndex - 1))) {
             continue;
           }
 
-          // Ignore small words except at beginning or end,
-          // or previous token is a new sentence.
-          if (
-            smallWords.has(word) &&
-            // Not the final token and word.
-            !(index + token.length === input.length && i === matches.length - 1)
-          ) {
+          // Ignore small words in the middle of hyphenated words.
+          if (smallWords.has(word) && wordSeparators.has(nextChar)) {
             continue;
           }
         }
 
-        if (IS_MANUAL_CASE.test(word)) {
-          continue;
-        }
-
-        // Only capitalize words after a valid word separator.
-        if (i > 0 && !wordSeparators.has(token.charAt(wordIndex - 1))) {
-          continue;
-        }
-
-        value =
-          value.slice(0, wordIndex) +
-          value.charAt(wordIndex).toLocaleUpperCase(locale) +
-          value.slice(wordIndex + 1);
+        value = upperAt(value, wordIndex, locale);
       }
 
       result += value;
+      isNewSentence =
+        isSentenceEnd || terminators.has(token.charAt(token.length - 1));
     }
-
-    const lastChar = token.charAt(token.length - 1);
-    isNewSentence = terminators.has(lastChar);
   }
 
   return result;
 }
+
+function upperAt(
+  input: string,
+  index: number,
+  locale: string | string[] | undefined,
+) {
+  return (
+    input.slice(0, index) +
+    input.charAt(index).toLocaleUpperCase(locale) +
+    input.slice(index + 1)
+  );
+}