Skip to content

Commit

Permalink
fix: strip extra white spaces from lines
Browse files Browse the repository at this point in the history
* refactor: simplify text gathering logic

Signed-off-by: yshalsager <[email protected]>
  • Loading branch information
yshalsager committed Mar 8, 2022
1 parent 9f3bd63 commit 5cab9d8
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
altgraph==0.17.2; python_version >= "3.6" and python_version < "3.11" and sys_platform == "darwin"
anyio==3.5.0; python_full_version >= "3.6.2" and python_version >= "3.6"
black==21.12b0; python_full_version >= "3.6.2"
black==22.1.0; python_full_version >= "3.6.2"
certifi==2021.10.8; python_version >= "3.6"
cfgv==3.3.1; python_full_version >= "3.6.1"
charset-normalizer==2.0.12; python_full_version >= "3.5.0" and python_version >= "3.6"
Expand Down Expand Up @@ -38,5 +38,5 @@ six==1.16.0; python_full_version >= "3.6.1"
sniffio==1.2.0; python_full_version >= "3.6.2" and python_version >= "3.6"
toml==0.10.2; python_full_version >= "3.6.1"
tomli==1.2.3; python_version >= "3.6" and python_full_version >= "3.6.2"
typing-extensions==4.1.1
typing-extensions==4.1.1; python_version >= "3.6"
virtualenv==20.13.2; python_full_version >= "3.6.1"
14 changes: 11 additions & 3 deletions wit_transcriber.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import re
import traceback
from argparse import ArgumentParser
from asyncio import BoundedSemaphore, ensure_future, gather, run
Expand All @@ -25,6 +26,7 @@ def __init__(
self.lang = lang
self.chunks = 0
self.processed_chunks = 0
self._text = ""
self.text_chunks: List[Tuple[int, str]] = []
self._verbose = verbose
self._sem = BoundedSemaphore(semaphore)
Expand All @@ -33,15 +35,18 @@ def __init__(

@property
def text(self) -> str:
    """Return the gathered transcript with punctuation and spacing tidied.

    Reads ``self._text``. When ``self.lang`` is ``"ar"``, ``?`` is replaced
    with the Arabic question mark and the space-delimited tokens listed
    below are dropped. Afterwards a line break is inserted after every
    ``.``, a single space directly following a line break is removed, and
    every run of two or more spaces is collapsed into one space.

    Returns:
        The cleaned-up transcript string.
    """
    text = self._text
    if self.lang == "ar":
        text = (
            text.replace("?", "؟")
            .replace(" آآ ", "")
            .replace(" اه اه ", " ")
            .replace(" اه ", " ")
        )
    # Put each sentence on its own line, without a leading space.
    text = text.replace(".", ".\n").replace("\n ", "\n")
    # The fourth positional parameter of re.sub is ``count``, not ``flags``;
    # passing re.M there capped the substitution at 8 replacements. The
    # pattern has no anchors, so no flag is needed at all.
    text = re.sub("[ ]{2,}", " ", text)
    return text

def has_api_key(self) -> bool:
Expand Down Expand Up @@ -133,7 +138,10 @@ async def transcribe(self, path: Path) -> None:
ensure_future(self.__bound_fetch(chunk, idx))
for idx, chunk in enumerate(chunks)
]
await gather(*tasks)
results = await gather(*tasks)
for result in results:
if result and result[0]:
self._text += result[0]
except CouldntDecodeError:
raise Exception(
"`Error decoding the audio file.\nEnsure that the provided audio is a valid audio file!`"
Expand Down

0 comments on commit 5cab9d8

Please sign in to comment.