[solidago] Implemented get_individual_scores & get_collective_scores #1994

Open · wants to merge 1 commit into base: neurips24
1 change: 1 addition & 0 deletions solidago/.gitignore
@@ -1,3 +1,4 @@
 /devenv/
 /dist/
+**__pycache__/
 .flake8/
143 changes: 121 additions & 22 deletions solidago/src/solidago/pipeline/inputs.py
@@ -52,9 +52,18 @@ def ratings_properties(self) -> pd.DataFrame:
     @abstractmethod
     def get_individual_scores(
         self,
-        criteria: Optional[str] = None,
         user_id: Optional[int] = None,
-    ) -> Optional[pd.DataFrame]:
+        entity_id: Optional[str] = None,
+        criteria: Optional[str] = None,
+    ) -> pd.DataFrame:
         raise NotImplementedError
 
+    @abstractmethod
+    def get_collective_scores(
+        self,
+        entity_id: Optional[str] = None,
+        criteria: Optional[str] = None,
+    ) -> pd.DataFrame:
+        raise NotImplementedError
+
     @abstractmethod
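These two accessors make every filter optional: each argument, when given, narrows the result, and the filters compose. Below is a standalone sketch of that contract on toy data (the frame and values are made up, not taken from the dataset; the real implementation appears later in this diff):

```python
from typing import Optional

import pandas as pd

# Made-up individual scores: one row per (user, entity, criterion).
scores = pd.DataFrame({
    "user_id": [1, 1, 2],
    "entity_id": ["e1", "e2", "e1"],
    "criteria": ["importance", "importance", "importance"],
    "score": [10.0, -4.2, 7.5],
})

def get_individual_scores(
    user_id: Optional[int] = None,
    entity_id: Optional[str] = None,
    criteria: Optional[str] = None,
) -> pd.DataFrame:
    dtf = scores
    if user_id is not None:
        dtf = dtf[dtf.user_id == user_id]
    if entity_id is not None:
        dtf = dtf[dtf.entity_id == entity_id]
    if criteria is not None:
        dtf = dtf[dtf.criteria == criteria]
    return dtf

assert len(get_individual_scores()) == 3                           # no filter: everything
assert len(get_individual_scores(user_id=1)) == 2                  # one filter
assert len(get_individual_scores(user_id=1, entity_id="e1")) == 1  # filters compose
```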
@@ -118,6 +127,18 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
             dataset_zip, _headers = urlretrieve(dataset_zip)  # nosec B310
 
         with zipfile.ZipFile(dataset_zip) as zip_file:
+            with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file:
+                # keep_default_na=False is required otherwise some public usernames
+                # such as "NA" are converted to float NaN.
+                self.users = pd.read_csv(users_file, keep_default_na=False)
+                self.users.index.name = "user_id"
+                # Fill trust_score on newly created users for which it was not computed yet
+                self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)
+
+                self.username_to_user_id = pd.Series(
+                    data=self.users.index, index=self.users["public_username"]
+                )
+
             with (zipfile.Path(zip_file) / "comparisons.csv").open(mode="rb") as comparison_file:
                 # keep_default_na=False is required otherwise some public usernames
                 # such as "NA" are converted to float NaN.
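The keep_default_na caveat repeated in these comments is easy to reproduce. A minimal sketch with a made-up one-row CSV:

```python
import io

import pandas as pd

raw = b"public_username,trust_score\nNA,0.5\n"

# With default parsing, the username "NA" is read as float NaN...
assert pd.read_csv(io.BytesIO(raw))["public_username"].isna().all()

# ...while keep_default_na=False keeps it as the literal string "NA".
assert (pd.read_csv(io.BytesIO(raw), keep_default_na=False)["public_username"] == "NA").all()
```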
@@ -126,30 +147,28 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
                     list(set(self.comparisons.video_a) | set(self.comparisons.video_b)),
                     name="video_id",
                 )
-                video_id_to_entity_id = {
+                self.video_id_to_entity_id = {
                     video_id: entity_id
                     for (entity_id, video_id) in self.entity_id_to_video_id.items()
                 }
                 self.comparisons["entity_a"] = self.comparisons["video_a"].map(
-                    video_id_to_entity_id
+                    self.video_id_to_entity_id
                 )
                 self.comparisons["entity_b"] = self.comparisons["video_b"].map(
-                    video_id_to_entity_id
+                    self.video_id_to_entity_id
                 )
                 self.comparisons.drop(columns=["video_a", "video_b"], inplace=True)
+                self.comparisons = self.comparisons.join(
+                    self.username_to_user_id, on="public_username"
+                )
 
-            with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file:
-                # keep_default_na=False is required otherwise some public usernames
-                # such as "NA" are converted to float NaN.
-                self.users = pd.read_csv(users_file, keep_default_na=False)
-                self.users.index.name = "user_id"
-                # Fill trust_score on newly created users for which it was not computed yet
-                self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)
-
-                self.username_to_user_id = pd.Series(
-                    data=self.users.index, index=self.users["public_username"]
-                )
-            self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username")
+            # List of all groups of public_username,criteria,entity_id present in comparisons
+            user_entity_criteria_pairs = pd.concat([
+                self.comparisons[["public_username", "entity_a", "criteria"]]
+                .rename(columns={"entity_a": "entity_id"}),
+                self.comparisons[["public_username", "entity_b", "criteria"]]
+                .rename(columns={"entity_b": "entity_id"})
+            ])  # Will contain duplicates; they must not be removed
 
             with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
                 # keep_default_na=False is required otherwise some public usernames
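The duplicates noted above are the point of user_entity_criteria_pairs: each comparison contributes one row per side, so groupby(...).size() on the concatenated pairs counts comparisons, while nunique() counts distinct raters. A toy illustration (made-up rows):

```python
import pandas as pd

comparisons = pd.DataFrame({
    "public_username": ["alice", "alice", "bob"],
    "entity_a": ["v1", "v1", "v1"],
    "entity_b": ["v2", "v3", "v2"],
    "criteria": ["importance", "importance", "importance"],
})

pairs = pd.concat([
    comparisons[["public_username", "entity_a", "criteria"]].rename(columns={"entity_a": "entity_id"}),
    comparisons[["public_username", "entity_b", "criteria"]].rename(columns={"entity_b": "entity_id"}),
])

counts = pairs.groupby(["entity_id", "criteria"]).size()
assert counts[("v1", "importance")] == 3  # v1 appears in all three comparisons

raters = pairs.groupby(["entity_id", "criteria"]).public_username.nunique()
assert raters[("v1", "importance")] == 2  # alice and bob
```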
@@ -160,11 +179,56 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
                 # keep_default_na=False is required otherwise some public usernames
                 # such as "NA" are converted to float NaN.
                 self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)
+                # Convert video to entity_id
+                self.collective_scores["entity_id"] = self.collective_scores["video"].map(
+                    self.video_id_to_entity_id
+                )
+                self.collective_scores.drop(columns=["video"], inplace=True)
+
+                # Add a column "comparisons": the number of comparisons made on this video
+                self.collective_scores["comparisons"] = self.collective_scores.merge(
+                    user_entity_criteria_pairs
+                    .groupby(["entity_id", "criteria"])
+                    .size()
+                    .reset_index(name="comparisons"),
+                    how="left",  # Keep all rows of collective_scores, in the same order
+                    on=["entity_id", "criteria"],
+                )["comparisons"]
+
+                # Add a column "users": the number of distinct users who have rated this video
+                self.collective_scores["users"] = self.collective_scores.merge(
+                    user_entity_criteria_pairs
+                    .groupby(["entity_id", "criteria"])
+                    .public_username
+                    .nunique()
+                    .reset_index(name="users"),
+                    how="left",  # Keep all rows of collective_scores, in the same order
+                    on=["entity_id", "criteria"],
+                )["users"]
 
             with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file:
                 # keep_default_na=False is required otherwise some public usernames
                 # such as "NA" are converted to float NaN.
                 self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
+                # Convert video to entity_id
+                self.individual_scores["entity_id"] = self.individual_scores["video"].map(
+                    self.video_id_to_entity_id
+                )
+                self.individual_scores.drop(columns=["video"], inplace=True)
+
+                # Add a column "comparisons": the number of comparisons made for each user,entity,criteria
+                self.individual_scores["comparisons"] = self.individual_scores.merge(
+                    user_entity_criteria_pairs
+                    .groupby(["public_username", "entity_id", "criteria"])
+                    .size()
+                    .reset_index(name="comparisons"),
+                    how="left",  # Keep all rows of individual_scores, in the same order
+                    on=["public_username", "entity_id", "criteria"],
+                )["comparisons"]
+                self.individual_scores = self.individual_scores.join(
+                    self.username_to_user_id,
+                    on="public_username"
+                )
 
     @classmethod
     def download(cls) -> "TournesolInputFromPublicDataset":
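The merge(...)["comparisons"] idiom used above relies on how="left" preserving the row order (and default integer index) of the left frame, so the single projected column aligns when assigned back. A minimal sketch with made-up frames:

```python
import pandas as pd

# `scores` stands in for collective_scores, `counts` for the aggregated counts.
scores = pd.DataFrame({"entity_id": ["v2", "v1"], "criteria": ["importance", "importance"]})
counts = pd.DataFrame({
    "entity_id": ["v1", "v2"],
    "criteria": ["importance", "importance"],
    "comparisons": [3, 1],
})

# how="left" keeps every row of `scores` in its original order, so the
# projected Series lines up with `scores`' default integer index.
scores["comparisons"] = scores.merge(counts, how="left", on=["entity_id", "criteria"])["comparisons"]
assert list(scores["comparisons"]) == [1, 3]  # v2 has 1 comparison, v1 has 3
```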
@@ -180,7 +244,15 @@ def get_comparisons(self, criteria=None, user_id=None) -> pd.DataFrame:
         if "score_max" not in dtf:
             # For compatibility with older datasets
             dtf["score_max"] = 10
-        return dtf[["user_id", "entity_a", "entity_b", "criteria", "score", "score_max", "weight"]]
+        return dtf[[
+            "user_id",
+            "entity_a",
+            "entity_b",
+            "criteria",
+            "score",
+            "score_max",
+            "weight"
+        ]]
 
     @cached_property
     def ratings_properties(self):
@@ -203,11 +275,38 @@ def ratings_properties(self):
 
     def get_individual_scores(
         self,
-        criteria: Optional[str] = None,
         user_id: Optional[int] = None,
-    ) -> Optional[pd.DataFrame]:
-        # TODO: read contributor scores from individual_scores.csv
-        return None
+        entity_id: Optional[str] = None,
+        criteria: Optional[str] = None,
+    ) -> pd.DataFrame:
+        dtf = self.individual_scores
+        if criteria is not None:
+            dtf = dtf[dtf.criteria == criteria]
+        if user_id is not None:
+            dtf = dtf[dtf.user_id == user_id]
+        if entity_id is not None:
+            dtf = dtf[dtf.entity_id == entity_id]
+        return dtf[[
+            "user_id",
+            "entity_id",
+            "criteria",
+            "score",
+            "uncertainty",
+            "voting_right",
+            "comparisons"
+        ]]
+
+    def get_collective_scores(
+        self,
+        entity_id: Optional[str] = None,
+        criteria: Optional[str] = None,
+    ) -> pd.DataFrame:
+        dtf = self.collective_scores
+        if criteria is not None:
+            dtf = dtf[dtf.criteria == criteria]
+        if entity_id is not None:
+            dtf = dtf[dtf.entity_id == entity_id]
+        return dtf[["entity_id", "criteria", "score", "uncertainty", "users", "comparisons"]]
 
     def get_vouches(self):
         vouchers = self.vouchers[
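Taken together, the loaders and the two getters give a uniform query surface over a public dataset export. A usage sketch (the zip path is the test fixture used below; any export should behave the same way):

```python
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# All filters are optional and can be combined.
individual = dataset.get_individual_scores(criteria="largely_recommended")
collective = dataset.get_collective_scores()

print(individual.columns.tolist())
# ['user_id', 'entity_id', 'criteria', 'score', 'uncertainty', 'voting_right', 'comparisons']
print(collective.columns.tolist())
# ['entity_id', 'criteria', 'score', 'uncertainty', 'users', 'comparisons']
```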
105 changes: 105 additions & 0 deletions solidago/tests/test_pipeline.py
@@ -3,8 +3,113 @@
 from solidago.pipeline import Pipeline
 from solidago.pipeline.inputs import TournesolInputFromPublicDataset
 
+
 @pytest.mark.parametrize("test", range(5))
 def test_pipeline_test_data(test):
     td = import_module(f"data.data_{test}")
     Pipeline()(td.users, td.vouches, td.entities, td.privacy, td.judgments)
 
+
+def test_tournesol_get_comparisons():
+    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")
+
+    # Test no filter
+    assert len(dataset.get_comparisons()) == 38387
+
+    # Test single filter
+    assert len(dataset.get_comparisons(
+        criteria="importance"
+    )) == 17143
+    assert len(dataset.get_comparisons(
+        user_id=dataset.username_to_user_id["le_science4all"]
+    )) == 5604
+
+    # Test all filters
+    assert len(dataset.get_comparisons(
+        criteria="largely_recommended",
+        user_id=dataset.username_to_user_id["lpfaucon"]
+    )) == 8471
+
+
+def test_tournesol_get_individual_scores():
+    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")
+
+    # Test no filter
+    assert len(dataset.get_individual_scores()) == 17319
+
+    # Test single filter
+    assert len(dataset.get_individual_scores(
+        criteria="largely_recommended"
+    )) == 9176
+    assert len(dataset.get_individual_scores(
+        user_id=dataset.username_to_user_id["aidjango"]
+    )) == 4379
+    assert len(dataset.get_individual_scores(
+        entity_id=dataset.video_id_to_entity_id["dBap_Lp-0oc"]
+    )) == 5
+
+    # Test multiple filters
+    assert len(dataset.get_individual_scores(
+        criteria="importance",
+        user_id=dataset.username_to_user_id["biscuissec"]
+    )) == 1493
+    assert len(dataset.get_individual_scores(
+        criteria="largely_recommended",
+        entity_id=dataset.video_id_to_entity_id["zItOqgnSvi8"]
+    )) == 2
+    assert len(dataset.get_individual_scores(
+        user_id=dataset.username_to_user_id["amatissart"],
+        entity_id=dataset.video_id_to_entity_id["BTUVUg9RQSg"]
+    )) == 1
+
+    # Test all filters
+    user_id = dataset.username_to_user_id["le_science4all"]
+    entity_id = dataset.video_id_to_entity_id["03dTJ4nXkXw"]
+    found = dataset.get_individual_scores(
+        criteria="importance",
+        user_id=user_id,
+        entity_id=entity_id
+    )
+    assert len(found) == 1
+    as_dict = found.to_dict(orient="records")[0]
+    assert as_dict == {
+        'user_id': user_id,
+        'entity_id': entity_id,
+        'criteria': 'importance',
+        'score': 82.81,
+        'uncertainty': 24.37,
+        'voting_right': 1.0,
+        'comparisons': 10,
+    }
+
+
+def test_tournesol_get_collective_scores():
+    dataset = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")
+
+    # Test no filter
+    assert len(dataset.get_collective_scores()) == 12184
+
+    # Test single filter
+    assert len(dataset.get_collective_scores(
+        criteria="largely_recommended"
+    )) == 6227
+    assert len(dataset.get_collective_scores(
+        entity_id=dataset.video_id_to_entity_id["kX3JKg-H5qM"]
+    )) == 2
+
+    # Test all filters
+    entity_id = dataset.video_id_to_entity_id["OlhC6n9Hhac"]
+    found = dataset.get_collective_scores(
+        criteria="importance",
+        entity_id=entity_id
+    )
+    assert len(found) == 1
+    as_dict = found.to_dict(orient="records")[0]
+    assert as_dict == {
+        'entity_id': entity_id,
+        'criteria': 'importance',
+        'score': 18.22,
+        'uncertainty': 60.09,
+        'users': 3,
+        'comparisons': 12,
+    }
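The new tests can be run in isolation. Assuming pytest is installed and the working directory is solidago/ (so the fixture path resolves), something like:

```python
# Roughly equivalent to running `pytest tests/test_pipeline.py -k tournesol`
# from the solidago/ directory; -k selects the three test_tournesol_* tests.
import pytest

pytest.main(["tests/test_pipeline.py", "-k", "tournesol"])
```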