Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for NoisyNER dataset #3463

Merged
merged 20 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flair/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@
NER_SWEDISH,
NER_TURKU,
NER_UKRAINIAN,
NOISY_NER_EST,
ONTONOTES,
UP_CHINESE,
UP_ENGLISH,
Expand Down Expand Up @@ -499,6 +500,7 @@
"NER_SWEDISH",
"NER_TURKU",
"NER_UKRAINIAN",
"NOISY_NER_EST",
"UP_CHINESE",
"UP_ENGLISH",
"UP_FINNISH",
Expand Down
131 changes: 131 additions & 0 deletions flair/datasets/sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -4839,6 +4839,137 @@ def __init__(
)


class NOISY_NER_EST(ColumnCorpus):
    """The NoisyNER corpus: Estonian NER with one clean and seven noisy label sets.

    Token/feature data is downloaded from the EstNER archive and the chosen
    label set from the NoisyNER repository, merged, split into train/dev/test
    and written out as tab-separated two-column (text, ner) files which are
    then loaded as a :class:`ColumnCorpus`.
    """

    data_url = "https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/patnlp/estner.cnll.zip"
    label_url = "https://raw.githubusercontent.com/uds-lsv/NoisyNER/master/data/only_labels"

    def __init__(
        self,
        version: int = 0,
        base_path: Optional[Union[str, Path]] = None,
        in_memory: bool = True,
        **corpusargs,
    ) -> None:
        """Initialize the NoisyNER corpus.

        Args:
            version: Chooses the label set for the data.
                v0 (default): clean labels.
                v1 to v7: different kinds of noisy label sets
                (details: https://ojs.aaai.org/index.php/AAAI/article/view/16938).
            base_path: Default is None, meaning the corpus gets automatically downloaded and saved.
                You can override this by passing a path to a directory containing the unprocessed
                files but typically this should not be necessary.
            in_memory: If True the dataset is kept in memory achieving speedups in training.
            **corpusargs: Further keyword arguments passed on to ``ColumnCorpus``.

        Raises:
            ValueError: If ``version`` is not an integer between 0 and 7.
        """
        if version not in range(8):
            # ValueError (not a bare Exception) so callers can catch it specifically.
            raise ValueError(
                "Please choose a version (int) from 0 to 7. With v0 (default) you get the clean labelset for the data, while v1 to v7 provide different kinds of noisy labelsets. For details see https://ojs.aaai.org/index.php/AAAI/article/view/16938."
            )

        base_path = self._set_path(base_path)
        features = self._load_features(base_path)

        if version == 0:
            preinstances = self._process_clean_labels(features)
        else:
            rdcd_features = self._rmv_clean_labels(features)
            labels = self._load_noisy_labels(version, base_path)
            preinstances = self._process_noisy_labels(rdcd_features, labels)

        instances = self._delete_empty_labels(version, preinstances)

        train, dev, test = self._split_data(instances)

        self._write_instances(version, base_path, "train", train)
        self._write_instances(version, base_path, "dev", dev)
        self._write_instances(version, base_path, "test", test)

        super().__init__(
            data_folder=base_path,
            train_file=f"estner_noisy_labelset{version}_train.tsv",
            dev_file=f"estner_noisy_labelset{version}_dev.tsv",
            test_file=f"estner_noisy_labelset{version}_test.tsv",
            column_format={0: "text", 1: "ner"},
            in_memory=in_memory,
            column_delimiter="\t",
            **corpusargs,
        )

    @classmethod
    def _set_path(cls, base_path) -> Path:
        """Return the dataset directory: the flair cache unless an explicit path is given."""
        return flair.cache_root / "datasets" / "estner" if not base_path else Path(base_path)

    @classmethod
    def _load_features(cls, base_path) -> List[List[str]]:
        """Download and unpack the EstNER archive, then read its tab-separated feature rows.

        Blank lines (sentence boundaries) become single-element ``[""]`` rows.
        """
        unpack_file(cached_path(cls.data_url, base_path), base_path, "zip", False)
        # Explicit UTF-8: the corpus contains Estonian characters that break on
        # platform-default encodings (e.g. cp1252 on Windows).
        with open(f"{base_path}/estner.cnll", encoding="utf-8") as in_file:
            prefeatures = in_file.readlines()
        return [feature.strip().split("\t") for feature in prefeatures]

    @classmethod
    def _process_clean_labels(cls, features) -> List[List[str]]:
        """Keep only the token (first column) and the clean gold label (last column)."""
        return [[instance[0], instance[-1]] for instance in features]

    @classmethod
    def _rmv_clean_labels(cls, features) -> List[List[str]]:
        """Drop the clean gold label (last column) from every feature row."""
        return [feature[:-1] for feature in features]

    @classmethod
    def _load_noisy_labels(cls, version, base_path) -> List[str]:
        """Download the noisy label file for ``version`` and return one label per line."""
        file_name = f"NoisyNER_labelset{version}.labels"
        cached_path(f"{cls.label_url}/{file_name}", base_path)
        with open(f"{base_path}/{file_name}", encoding="utf-8") as in_file:
            return in_file.read().splitlines()

    @classmethod
    def _process_noisy_labels(cls, rdcd_features, labels) -> List[List[str]]:
        """Pair each token with its noisy label; empty rows mark sentence boundaries.

        The label file contains exactly one label per non-empty feature row, in
        order; the checks below verify that alignment for the downloaded data.
        """
        instances = []
        label_idx = 0
        for feature in rdcd_features:
            if len(feature) == 0:
                instances.append([""])
            else:
                assert label_idx < len(labels), "More tokens than noisy labels: label file misaligned with features"
                instances.append([feature[0], labels[label_idx]])
                label_idx += 1
        assert label_idx == len(labels), "Not all noisy labels were consumed: label file misaligned with features"
        return instances

    @classmethod
    def _delete_empty_labels(cls, version, preinstances) -> List[List[str]]:
        """Filter out instances whose token is the placeholder ``"--"``.

        NOTE(review): in the noisy branch ``instance`` is a list, so
        ``instance != "--"`` is always true and nothing is filtered — kept
        as-is to preserve the original (published) behavior; verify upstream.
        """
        instances = []
        if version == 0:
            for instance in preinstances:
                if instance[0] != "--":
                    instances.append(instance)
        else:
            for instance in preinstances:
                if instance != "--":
                    instances.append(instance)
        return instances

    @classmethod
    def _split_data(cls, instances) -> Tuple[List[List[str]], List[List[str]], List[List[str]]]:
        """Split into train/dev/test at the fixed line offsets of the official NoisyNER splits."""
        train = instances[:185708]
        dev = instances[185708:208922]
        test = instances[208922:]
        return train, dev, test

    @classmethod
    def _write_instances(cls, version, base_path, split, data):
        """Write one split as a tab-separated CoNLL-style file (blank line = sentence boundary)."""
        column_separator = "\t"  # CoNLL format
        with open(f"{base_path}/estner_noisy_labelset{version}_{split}.tsv", "w", encoding="utf-8") as out_file:
            for instance in data:
                out_file.write(column_separator.join(instance))
                out_file.write("\n")


class MASAKHA_POS(MultiCorpus):
def __init__(
self,
Expand Down
Loading