Skip to content

Commit

Permalink
Improve acronym and hyphenated word handling
Browse files Browse the repository at this point in the history
  • Loading branch information
blakeembrey committed Dec 16, 2023
1 parent 0f6e762 commit f6ce967
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 46 deletions.
6 changes: 3 additions & 3 deletions packages/title-case/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ titleCase("follow step-by-step instructions"); //=> "Follow Step-by-Step Instruc

### Options

- `locale?: string | string[]`
- `locale?: string | string[]` Locale used for `toLocaleUpperCase` during case transformation (default: `undefined`)
- `sentenceCase?: boolean` Only capitalize the first word of each sentence (default: `false`)
- `sentenceTerminators?: Set<string>` Set of characters to consider a new sentence under sentence case behavior (e.g. `.`, default: `SENTENCE_TERMINATORS`)
- `smallWords?: Set<string>` Set of words to keep lower-case when `sentenceCase === false` (default: `SMALL_WORDS`)
- `titleTerminators?: Set<string>` Set of characters to consider a new sentence under title case behavior (e.g. `:`, default: `TITLE_TERMINATORS`).
- `wordSeparators?: Set<string>` Set of characters to consider a new word for capitalization, such as hyphenation (default: `WORD_SEPARATORS`).
- `titleTerminators?: Set<string>` Set of characters to consider a new sentence under title case behavior (e.g. `:`, default: `TITLE_TERMINATORS`)
- `wordSeparators?: Set<string>` Set of characters to consider a new word for capitalization, such as hyphenation (default: `WORD_SEPARATORS`)

## TypeScript and ESM

Expand Down
65 changes: 57 additions & 8 deletions packages/title-case/src/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,31 @@ import { inspect } from "util";
import { titleCase, Options } from "./index.js";

/**
* Based on https://github.com/gouch/to-title-case/blob/master/test/tests.json.
* Original tests from https://github.com/gouch/to-title-case/blob/master/test/tests.json.
*/
const TEST_CASES: [string, string, Options?][] = [
["", ""],
["2019", "2019"],
["test", "Test"],
["two words", "Two Words"],
["one. two.", "One. Two."],
["one two", "One Two"],
["one two three", "One Two Three"],
[
"Start a an and as at but by en for if in nor of on or per the to v vs via end",
"Start a an and as at but by en for if in nor of on or per the to v vs via End",
],
["a small word starts", "A Small Word Starts"],
["small word ends on", "Small Word Ends On"],
["questions?", "Questions?"],
["Two questions?", "Two Questions?"],
["one sentence. two sentences.", "One Sentence. Two Sentences."],
["we keep NASA capitalized", "We Keep NASA Capitalized"],
["pass camelCase through", "Pass camelCase Through"],
["this sub-phrase is nice", "This Sub-Phrase Is Nice"],
["follow step-by-step instructions", "Follow Step-by-Step Instructions"],
["easy as one-two-three end", "Easy as One-Two-Three End"],
["start on-demand end", "Start On-Demand End"],
["start in-or-out end", "Start In-or-Out End"],
["start e-commerce end", "Start E-Commerce End"],
["start e-mail end", "Start E-Mail End"],
["your hair[cut] looks (nice)", "Your Hair[cut] Looks (Nice)"],
["keep that colo(u)r", "Keep that Colo(u)r"],
["leave Q&A unscathed", "Leave Q&A Unscathed"],
[
"piña colada while you listen to ænima",
Expand All @@ -31,21 +42,31 @@ const TEST_CASES: [string, string, Options?][] = [
['"double quotes"', '"Double Quotes"'],
['double quotes "inner" word', 'Double Quotes "Inner" Word'],
["fancy double quotes “inner” word", "Fancy Double Quotes “Inner” Word"],
["'single quotes'", "'Single Quotes'"],
["single quotes 'inner' word", "Single Quotes 'Inner' Word"],
["fancy single quotes ‘inner’ word", "Fancy Single Quotes ‘Inner’ Word"],
["“‘a twice quoted subtitle’”", "“‘A Twice Quoted Subtitle’”"],
["have you read “The Lottery”?", "Have You Read “The Lottery”?"],
["one: two", "One: Two"],
["one two: three four", "One Two: Three Four"],
['one two: "Three Four"', 'One Two: "Three Four"'],
["one on: an end", "One On: An End"],
['one on: "an end"', 'One On: "An End"'],
["email [email protected] address", "Email [email protected] Address"],
[
"you have an https://example.com/ title",
"You Have an https://example.com/ Title",
],
["_underscores around words_", "_Underscores Around Words_"],
["*asterisks around words*", "*Asterisks Around Words*"],
["this vs. that", "This vs. That"],
["this vs that", "This vs That"],
["this v. that", "This v. That"],
["this *vs* that", "This *vs* That"],
["this v that", "This v That"],
// Contractions with a period are not supported due to sentence support.
// It's difficult to tell if a period is part of a contraction or not.
["this vs. that", "This Vs. That"],
["this v. that", "This V. That"],
["", ""],
[
"Scott Moritz and TheStreet.com’s million iPhone la-la land",
"Scott Moritz and TheStreet.com’s Million iPhone La-La Land",
Expand All @@ -54,6 +75,7 @@ const TEST_CASES: [string, string, Options?][] = [
"Notes and observations regarding Apple’s announcements from ‘The Beat Goes On’ special event",
"Notes and Observations Regarding Apple’s Announcements From ‘The Beat Goes On’ Special Event",
],
["2018", "2018"],
[
"the quick brown fox jumps over the lazy dog",
"The Quick Brown Fox Jumps over the Lazy Dog",
Expand All @@ -76,17 +98,44 @@ const TEST_CASES: [string, string, Options?][] = [
["the iPhone: a quote", "The iPhone: A Quote"],
["the iPhone: a quote", "The iPhone: a quote", { sentenceCase: true }],
["the U.N. and me", "The U.N. and Me"],
["the *U.N.* and me", "The *U.N.* and Me"],
["the U.N. and me", "The U.N. and me", { sentenceCase: true }],
["the U.N. and me", "The U.N. And Me", { smallWords: new Set() }],
["start-and-end", "Start-and-End"],
["go-to-iPhone", "Go-to-iPhone"],
["the go-to", "The Go-To"],
["the go-to", "The go-to", { sentenceCase: true }],
["this to-go", "This To-Go"],
["test(ing)", "Test(ing)"],
["test(s)", "Test(s)"],
["Keep #tag", "Keep #tag"],
['"Hello world", says John.', '"Hello World", Says John.'],
[
'"Hello world", says John.',
'"Hello world", says John.',
{ sentenceCase: true },
],
["foo/bar", "Foo/Bar"],
["this is the *end.*", "This Is the *End.*"],
["*something about me?* and you.", "*Something About Me?* And You."],
[
"*something about me?* and you.",
"*Something about me?* And you.",
{ sentenceCase: true },
],
["something about _me-too?_ and you.", "Something About _Me-Too?_ And You."],
["something about _me_? and you.", "Something About _Me_? And You."],
[
"something about _me_? and you.",
"Something about _me_? And you.",
{ sentenceCase: true },
],
[
"something about _me-too_? and you too.",
"Something About _Me-Too_? And You Too.",
],
["an example. i.e. test.", "An Example. I.e. Test."],
['an example. "i.e. test."', 'An Example. "I.e. Test."'],
];

describe("swap case", () => {
Expand Down
93 changes: 58 additions & 35 deletions packages/title-case/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
const TOKENS = /(\S+)|(.)/g;
const IS_SPECIAL_CASE = /[\.#]\p{L}/u; // #tag, example.com, etc.
const IS_SPECIAL_CASE = /[\.#]\p{Alphabetic}/u; // #tag, example.com, etc.
const IS_MANUAL_CASE = /\p{Ll}(?=[\p{Lu}])/u; // iPhone, iOS, etc.
const ALPHANUMERIC_PATTERN = /[\p{L}\d]+/gu;
const IS_ACRONYM = /(?:\p{Lu}\.){2,}$/u;
const ALPHANUMERIC_PATTERN = /\p{Alphabetic}+/gu;
// Matches a token that consists solely of a dotted acronym (U.N., i.e., e.g.),
// optionally wrapped in non-alphabetic characters such as quotes or "*".
//
// Group 1 captures the WHOLE leading non-alphabetic prefix so that its length
// is the offset of the acronym's first letter. (A repeated group written as
// `(\P{Alphabetic})*` only retains its final iteration, i.e. one character,
// which yields the wrong offset for prefixes of two or more characters.)
// Group 2 retains only the final trailing non-alphabetic character, if any.
const IS_ACRONYM =
  /^(\P{Alphabetic}*)(?:\p{Alphabetic}\.){2,}(\P{Alphabetic})*$/u;

/**
 * Characters that start a new word inside a single whitespace-delimited token
 * (dash variants, the hyphen and the forward slash), so each part of a
 * hyphenated word can be capitalized. Default for the `wordSeparators` option.
 */
export const WORD_SEPARATORS = new Set(["—", "–", "-", "―", "/"]);

Expand Down Expand Up @@ -94,61 +95,83 @@ export function titleCase(

// Ignore URLs, email addresses, acronyms, etc.
if (IS_SPECIAL_CASE.test(token)) {
result += token;

// The period at the end of an acronym is not a new sentence.
if (IS_ACRONYM.test(token)) {
isNewSentence = false;
const acronym = token.match(IS_ACRONYM);

// The period at the end of an acronym is not a new sentence,
// but we should still uppercase the first letter for i.e., e.g., etc.
if (acronym) {
const [_, prefix = "", suffix = ""] = acronym;
result += upperAt(token, prefix.length, locale);
isNewSentence = terminators.has(suffix.charAt(0));
continue;
}

result += token;
isNewSentence = terminators.has(token.charAt(token.length - 1));
} else {
const matches = Array.from(token.matchAll(ALPHANUMERIC_PATTERN));
let value = token;
let isSentenceEnd = false;

for (let i = 0; i < matches.length; i++) {
const { 0: word, index: wordIndex = 0 } = matches[i];
const nextChar = token.charAt(wordIndex + word.length);

isSentenceEnd = terminators.has(nextChar);

// Reset "new sentence" when we find a word.
// Always capitalize the first word and reset "new sentence".
if (isNewSentence) {
isNewSentence = false;
} else {
// Skip capitalizing all words if sentence case is enabled.
if (sentenceCase) {
}
// Skip capitalizing if sentence case is enabled or the word is manually cased (e.g. iPhone).
else if (sentenceCase || IS_MANUAL_CASE.test(word)) {
continue;
}
// Handle simple words.
else if (matches.length === 1) {
// Avoid capitalizing small words, except at the end of a sentence.
if (smallWords.has(word)) {
const isFinalToken = index + token.length === input.length;

if (!isFinalToken && !isSentenceEnd) {
continue;
}
}
}
// Multi-word tokens need to be parsed differently.
else if (i > 0) {
// Avoid capitalizing words without a valid word separator,
// e.g. "apple's" or "test(ing)".
if (!wordSeparators.has(token.charAt(wordIndex - 1))) {
continue;
}

// Ignore small words except at beginning or end,
// or previous token is a new sentence.
if (
smallWords.has(word) &&
// Not the final token and word.
!(index + token.length === input.length && i === matches.length - 1)
) {
// Ignore small words in the middle of hyphenated words.
if (smallWords.has(word) && wordSeparators.has(nextChar)) {
continue;
}
}

if (IS_MANUAL_CASE.test(word)) {
continue;
}

// Only capitalize words after a valid word separator.
if (i > 0 && !wordSeparators.has(token.charAt(wordIndex - 1))) {
continue;
}

value =
value.slice(0, wordIndex) +
value.charAt(wordIndex).toLocaleUpperCase(locale) +
value.slice(wordIndex + 1);
value = upperAt(value, wordIndex, locale);
}

result += value;
isNewSentence =
isSentenceEnd || terminators.has(token.charAt(token.length - 1));
}

const lastChar = token.charAt(token.length - 1);
isNewSentence = terminators.has(lastChar);
}

return result;
}

/**
 * Return a copy of `input` with the character at `index` upper-cased.
 *
 * The character is read as a full Unicode code point, so letters outside the
 * Basic Multilingual Plane (stored as a surrogate pair) are upper-cased as a
 * unit instead of being split into two unchanged halves.
 *
 * @param input The string to transform.
 * @param index UTF-16 offset of the character to upper-case.
 * @param locale Locale(s) forwarded to `toLocaleUpperCase`.
 * @returns The transformed string, or `input` unchanged when `index` does not
 * address a character (negative or past the end).
 */
function upperAt(
  input: string,
  index: number,
  locale: string | string[] | undefined,
): string {
  const codePoint = input.codePointAt(index);

  // Nothing to upper-case at this position.
  if (codePoint === undefined) return input;

  const char = String.fromCodePoint(codePoint);

  return (
    input.slice(0, index) +
    // The upper-cased form may be longer than the source (e.g. "ß" -> "SS").
    char.toLocaleUpperCase(locale) +
    // Skip one or two code units depending on the width of the code point.
    input.slice(index + char.length)
  );
}

0 comments on commit f6ce967

Please sign in to comment.