Skip to content
QIUSHI BAI edited this page Aug 9, 2019 · 2 revisions

If you already have a database to hold your data, you can skip this page and jump directly to Register Dataset to register your dataset schema to Cloudberry. (Note the following tutorial pages all use the example schema below for illustration.)

This page is using Twitter as data source and AsterixDB as database to illustrate how to prepare a database to be used by Cloudberry.

Twitter data schema

The following example JSON shows the schema of original Twitter data collected from Twitter Developer API. (Unused fields are omitted)

{
   "created_at":"Tue May 08 05:08:50 +0000 2018",
   "id":993719352028467200,
   "text":"Los festejados \ud83c\udf89\ud83c\udf82 en Comisi\u00f3n Estatal Electoral Nuevo Le\u00f3n https:\/\/t.co\/XNbDcYFykg",
   "in_reply_to_status_id":null,
   "in_reply_to_user_id":null,
   "favorite_count":0,
   "coordinates":{
      "type":"Point",
      "coordinates":[
         -100.32601,
         25.68514
      ]
   },
   "retweet_count":0,
   "lang":"es",
   "retweeted":false,
   "entities":{
      "hashtags":[

      ],
      "user_mentions":[

      ]
   },
   "user":{
      "id":305955630,
      "name":"Roc\u00edo",
      "screen_name":"rocio_ae",
      "profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/727857674239401985\/fgF-nA3n_normal.jpg",
      "lang":"es",
      "location":"Nuevo Le\u00f3n ",
      "created_at":"Fri May 27 01:43:01 +0000 2011",
      "description":"\u2764\ufe0f",
      "followers_count":243,
      "friends_count":749,
      "statuses_count":4950
   },
   "place":{
      "country":"M\u00e9xico",
      "country_code":"MX",
      "full_name":"Monterrey, Nuevo Le\u00f3n",
      "id":"b19e24ce42ccd6aa",
      "name":"Monterrey",
      "place_type":"city",
      "bounding_box":{
         "type":"Polygon",
         "coordinates":[
            [
               [
                  -100.421037,
                  25.480538
               ],
               [
                  -100.421037,
                  25.802899
               ],
               [
                  -100.166146,
                  25.802899
               ],
               [
                  -100.166146,
                  25.480538
               ]
            ]
         ]
      }
   }
}

Create tables in AsterixDB

We use following SQL++ statements to create tables to hold Twitter data (looks like above) in AsterixDB.

create dataverse twitter if not exists;
use twitter;
create type typeUser if not exists as open {
    id: int64,
    name: string,
    screen_name : string,
    profile_image_url : string?,
    lang : string,
    location: string,
    create_at: date,
    description: string,
    followers_count: int32,
    friends_count: int32,
    statues_count: int64
};
create type typePlace if not exists as open{
    country : string,
    country_code : string,
    full_name : string,
    id : string,
    name : string,
    place_type : string,
    bounding_box : rectangle
};
create type typeGeoTag if not exists as open {
    stateID: int32,
    stateName: string,
    countyID: int32,
    countyName: string,
    cityID: int32?,
    cityName: string?
};
create type typeTweet if not exists as open{
    create_at : datetime,
    id: int64,
    text: string,
    in_reply_to_status : int64,
    in_reply_to_user : int64,
    favorite_count : int64,
    coordinate: point?,
    retweet_count : int64,
    lang : string,
    is_retweet: boolean,
    hashtags : {{ string }} ?,
    user_mentions : {{ int64 }} ? ,
    user : typeUser,
    place : typePlace?,
    geo_tag: typeGeoTag
};
create dataset ds_tweet(typeTweet) if not exists primary key id
with filter on create_at with {"merge-policy":{"name":"prefix","parameters":{"max-mergable-component-size":134217728, "max-tolerance-component-count":5}}};

create index text_idx if not exists on ds_tweet(text) type fulltext;

Note the following tutorial pages all use the example schema above for illustration. Now you can register the above dataset to Cloudberry Register Dataset.