-
-
Notifications
You must be signed in to change notification settings - Fork 47
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[dataviz] feat: add graph Active contributors grouped by age #1991
Changes from 8 commits
f4b8664
7dbb5cf
79dc4b6
d37c5bb
5718b25
8e536cd
ad42225
e3073bd
4f87e8b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,10 +8,16 @@ | |
|
||
Comparison: | ||
A public comparison between two videos involves one or more quality criteria. | ||
|
||
Active contributor (during a period of time): | ||
An active contributor is a user who makes at least one public comparison during a given | ||
period of time. | ||
""" | ||
|
||
import pandas as pd | ||
import plotly.express as px | ||
import streamlit as st | ||
from dateutil.relativedelta import relativedelta | ||
from utils import set_df | ||
|
||
st.set_page_config( | ||
|
@@ -49,7 +55,7 @@ def add_contributors_evolution(): | |
st.plotly_chart(fig) | ||
|
||
with total_tab: | ||
fig = (df.groupby("public_username").first().groupby("week_date").size().cumsum().plot()) | ||
fig = df.groupby("public_username").first().groupby("week_date").size().cumsum().plot() | ||
fig.update_xaxes(title="Week") | ||
fig.update_yaxes(title="Total number of public contributors") | ||
fig.update_layout(showlegend=False) | ||
|
@@ -82,7 +88,106 @@ def add_comparisons_evolution(): | |
st.plotly_chart(fig) | ||
|
||
|
||
def add_comparisons_evolution_grouped_by_contributors_age():
    """
    Display the number of active contributors per week, grouped by age.

    The "age" of a contributor is the calendar quarter (a "season" of 3 months) of their first
    public comparison. Contributors who were active during a single week only are gathered in a
    dedicated "= last comparison date" group instead of a season.

    Reads `st.session_state.df` (columns `public_username` and `week_date`) and renders a stacked
    bar chart with one bar per week and one colour per group. Nothing is plotted when there is no
    data.
    """

    st.markdown("#### Active contributors grouped by age")
    st.info(
        "An active contributor is a user who makes at least one public comparison during a given"
        " period of time.",
    )

    single_week_label = "= last comparison date"

    # Keep only the required data: one row per (user, week).
    activity = (
        st.session_state.df[["public_username", "week_date"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    if activity.empty:
        # min()/max() below would be NaT; there is nothing to plot.
        return

    # Convert week_date to plain, timezone-naive datetimes so that they can be sorted and
    # compared. Per review feedback, the column may arrive as an unordered categorical, for which
    # max() fails with "Categorical is not ordered for operation max"; going through `object`
    # drops the categorical dtype. The timezone is removed explicitly with tz_localize(None)
    # because astype("datetime64[ns]") refuses timezone-aware data on recent pandas versions.
    activity["week_date"] = pd.to_datetime(
        activity["week_date"].astype(object), utc=True
    ).dt.tz_localize(None)

    # All the weeks to display on the x axis.
    all_weeks = pd.date_range(
        start=activity["week_date"].min(), end=activity["week_date"].max(), freq="W-MON"
    )

    # First and last active week of each user, indexed by public_username.
    span = activity.groupby("public_username")["week_date"].agg(
        first_week="min", last_week="max"
    )

    # Season of a user = first day of the calendar quarter containing their first week
    # (Jan 1, Apr 1, Jul 1 or Oct 1), so that it matches the "Jan to Mar"-style labels below.
    season_start = span["first_week"].dt.to_period("Q").dt.start_time
    is_single_week = span["first_week"].eq(span["last_week"])

    def season_label(start):
        # e.g. "2021 Jan to Mar"
        return start.strftime("%Y %b") + " to " + (start + relativedelta(months=2)).strftime("%b")

    # The group of each user is kept as a string label, so the column has a single dtype.
    span["cohort"] = season_start.map(season_label).mask(is_single_week, single_week_label)

    # Legend order: seasons in chronological order, then the single-week group last.
    cohort_order = [
        season_label(start)
        for start in pd.DatetimeIndex(season_start[~is_single_week].unique()).sort_values()
    ]
    if is_single_week.any():
        cohort_order.append(single_week_label)

    # One column per group: number of distinct active users of the group for each week (0 when
    # nobody of the group was active that week).
    chart_data = pd.DataFrame({"week_date": all_weeks})
    for label in cohort_order:
        members = span.index[span["cohort"] == label]
        weekly_users = (
            activity.loc[activity["public_username"].isin(members)]
            .groupby("week_date")["public_username"]
            .nunique()
        )
        chart_data[label] = chart_data["week_date"].map(weekly_users).fillna(0)

    fig = px.bar(
        chart_data,
        x="week_date",
        y=cohort_order,
        labels={
            "value": "Active contributors",
            "week_date": "Week",
            "variable": "First comparison date",
        },
        # NOTE(review): at least 2 sample points are requested because sampling a single point
        # appears to divide by zero in plotly - confirm. Extra colours are simply unused.
        color_discrete_sequence=px.colors.sample_colorscale(
            "turbo", samplepoints=max(len(cohort_order), 2)
        ),
        color_discrete_map={single_week_label: "grey"},
    )
    st.plotly_chart(fig)
|
||
|
||
# Make pandas' `.plot()` return plotly figures; this must be set before the sections are rendered.
pd.options.plotting.backend = "plotly"

# Render each section of the page, in display order.
for render_section in (
    add_contributors_evolution,
    add_comparisons_evolution,
    add_comparisons_evolution_grouped_by_contributors_age,
):
    render_section()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not a direct dependency of the project; should we add it to the `requirements.txt` file?