odps 中的hdfs odps mapreduce

转载

西门吹雪 2024-03-04 01:37:17

文章标签 odps 中的hdfs java 类目 List 文章分类 架构后端开发

用户通过在jar命令中设置 -local 参数，在本地模拟MapReduce的运行过程，从而进行本地调试。本地运行时，客户端会从ODPS中下载本地调试所需要的输入表的元信息、数据，所需要的资源以及输出表的元信息，并将这些信息保存到一个名为warehouse的本地目录中。在程序运行结束后，会将计算结果输出到warehouse目录内的一个文件中。如果本地的warehouse目录下已经下载了输入表及被引用的资源，在下一次运行时，会直接引用warehouse下的数据及文件，而不会重复下载。

在本地运行过程中，仍然会启动多个Map及Reduce进程处理数据，但这些进程不是并发运行，而是依次串行运行。此外这个模拟运行过程与真正的分布式运行有如下差别：

- 输入表行数限制：目前，最多只会下载100行数据；
- 资源的使用：在分布式环境中，ODPS会限制引用资源的大小，详情请参考应用限制。但在本地运行环境中，不会有资源大小的限制；
- 安全限制：ODPS MapReduce及UDF程序在分布式环境中运行时受到Java沙箱的限制。但在本地运行时，则没有此限制。

package org.digdata.purchase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.digdata.model.TrainUser;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * 
 * @author wwhhf
 * @since 2016年5月23日
 * @comment 
 *          产生user_id:->item_cat1|behavior_type1,item_cat2|behavior_type2...,item_catn
 *          |behavior_type3
 * 
 */
public class UserItemCategorySerial {

    /**
     * 
     * @author wwhhf
     * @since 2016年5月23日
     * @comment user_id:record
     */
    private static class UserItemCategorySerialMapper extends MapperBase {

        public void map(long key, Record record, TaskContext context)
                throws IOException {
            Long user_id = record.getBigint(0);
            Record nextkey = context.createMapOutputKeyRecord();
            nextkey.setBigint(0, user_id);
            context.write(nextkey, record);
        }

    }

    /**
     * 
     * @author wwhhf
     * @since 2016年5月23日
     * @comment user_id->
     */
    private static class UserItemCategorySerialReducer extends ReducerBase {

        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            List<TrainUser> list = new ArrayList<TrainUser>();
            while (values.hasNext()) {
                Record value = values.next();
                Long user_id = value.getBigint(0);
                Long item_id = value.getBigint(1);
                Long behavior_type = value.getBigint(2);
                Long item_category = value.getBigint(4);
                Date timeDate = value.getDatetime(5);
                list.add(new TrainUser(user_id, item_id, behavior_type,
                        item_category, timeDate));
            }
            if (list.size() > 0) {
                Collections.sort(list);
                StringBuffer sb = new StringBuffer();
                for (int i = 0, len = list.size(); i < len; i++) {
                    if (i != 0) {
                        sb.append(",");
                    }
                    TrainUser user = list.get(i);
                    sb.append(user.getItem_category());
                }
                Record output = context.createOutputRecord();
                output.set(new Object[] { key.get(0), sb.toString() });
                context.write(output);
            }
        }

    }

    public static void solve() throws OdpsException {
        JobConf job = new JobConf();

        job.setMapOutputKeySchema(SchemaUtils.fromString("user_id:bigint"));
        job.setMapOutputValueSchema(SchemaUtils.fromString("user_id:bigint,"
                + "item_id:bigint," + "behavior_type:bigint,"
                + "user_geohash:string," + "item_category:bigint,"
                + "time:datetime"));

        InputUtils.addTable(
                TableInfo.builder().tableName("train_user").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("train_user_item_category")
                        .build(), job);

        job.setMapperClass(UserItemCategorySerialMapper.class);
        job.setReducerClass(UserItemCategorySerialReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

package org.digdata.purchase;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * MapReduce job that turns the per-user category sequences in
 * {@code train_user_item_category} into a weighted category graph.
 *
 * <p>Two labelled outputs are produced:
 * <ul>
 * <li>{@code edge_out} ({@code train_item_category_edge}):
 * {@code (categorya, categoryb, count)} for each adjacent pair;</li>
 * <li>{@code point_out} ({@code train_item_category_point}):
 * {@code (categorya, count)}, the total number of pairs starting at
 * {@code categorya}.</li>
 * </ul>
 *
 * @author wwhhf
 * @since 2016-05-23
 */
public class GenGraph {

    /**
     * Key-column sentinel marking the "categorya to any category" total.
     * NOTE(review): assumes -1 is never a real category id - confirm.
     */
    private static final long ANY_CATEGORY = -1L;

    /**
     * Emits a count of 1 for every adjacent pair in the category sequence,
     * once under the concrete pair and once under the "any" sentinel.
     *
     * @author wwhhf
     * @since 2016-05-23
     */
    private static class GenGraphMapper extends MapperBase {

        @Override
        public void map(long key, Record record, TaskContext context)
                throws IOException {
            String serial = record.getString(1);
            if (serial == null || serial.isEmpty()) {
                // Nothing to pair up; also avoids Long.valueOf("") failing.
                return;
            }
            String[] tokens = serial.split(",");
            List<Long> categories = new ArrayList<Long>(tokens.length);
            for (String token : tokens) {
                categories.add(Long.valueOf(token));
            }

            for (int i = 1, len = categories.size(); i < len; i++) {
                Record nextkey = context.createMapOutputKeyRecord();
                // The value record must follow the map output VALUE schema;
                // creating it from the key schema was a bug.
                Record nextvalue = context.createMapOutputValueRecord();

                // Intended weight: (count of a -> b) / (count of a -> any).
                // categorya is the element at position i, categoryb is its
                // predecessor at position i - 1.
                Long categorya = categories.get(i);
                Long categoryb = categories.get(i - 1);

                // Specific pair. Long values are used throughout because the
                // columns are bigint and are read back with getBigint.
                nextkey.setBigint(0, categorya);
                nextkey.setBigint(1, categoryb);
                nextvalue.setBigint(0, 1L);
                context.write(nextkey, nextvalue);

                // Any category.
                nextkey.setBigint(1, ANY_CATEGORY);
                nextvalue.setBigint(0, 1L);
                context.write(nextkey, nextvalue);
            }
        }

    }

    /**
     * Sums the counts of one key and routes the result to {@code point_out}
     * when the second key column is the "any" sentinel, otherwise to
     * {@code edge_out}.
     *
     * @author wwhhf
     * @since 2016-05-23
     */
    private static class GenGraphReducer extends ReducerBase {

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // Primitive accumulator avoids re-boxing on every iteration.
            long sum = 0L;
            while (values.hasNext()) {
                sum += values.next().getBigint(0);
            }

            long dest = key.getBigint(1);
            if (dest == ANY_CATEGORY) {
                Record output = context.createOutputRecord("point_out");
                output.set(new Object[] { key.get(0), sum });
                context.write(output, "point_out");
            } else {
                Record output = context.createOutputRecord("edge_out");
                output.set(new Object[] { key.get(0), key.get(1), sum });
                context.write(output, "edge_out");
            }
        }

    }

    /**
     * Configures the job, submits it and blocks until it completes.
     *
     * @throws OdpsException if submitting or running the job fails
     */
    public static void solve() throws OdpsException {
        JobConf job = new JobConf();

        // Map output types.
        job.setMapOutputKeySchema(SchemaUtils
                .fromString("categorya:bigint,categoryb:bigint"));
        job.setMapOutputValueSchema(SchemaUtils.fromString("sorce:bigint"));

        // Input table and the two labelled output tables.
        InputUtils.addTable(
                TableInfo.builder().tableName("train_user_item_category")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("train_item_category_edge")
                        .label("edge_out").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("train_item_category_point")
                        .label("point_out").build(), job);

        job.setMapperClass(GenGraphMapper.class);
        job.setReducerClass(GenGraphReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

package org.digdata.purchase;

import java.io.IOException;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * MapReduce job that computes a behavior score for every item:
 * {@code (item_id, item_category) -> score}, read from {@code train_user}
 * and written to {@code train_item_score}.
 *
 * @author wwhhf
 * @since 2016-05-23
 */
public class CalcItemScore {

    /**
     * Emits {@code (item_id, item_category) -> behavior_type} for each input
     * row (input columns 1, 4 and 2 respectively).
     *
     * @author wwhhf
     * @since 2016-05-23
     */
    private static class CalcItemScoreMapper extends MapperBase {

        @Override
        public void map(long key, Record record, TaskContext context)
                throws IOException {
            Record nextkey = context.createMapOutputKeyRecord();
            nextkey.setBigint(0, record.getBigint(1));
            nextkey.setBigint(1, record.getBigint(4));

            Record nextvalue = context.createMapOutputValueRecord();
            nextvalue.setBigint(0, record.getBigint(2));

            context.write(nextkey, nextvalue);
        }

    }

    /**
     * Counts how often each behavior type occurs for one item and combines
     * the counts into a weighted score.
     *
     * @author wwhhf
     * @since 2016-05-23
     */
    private static class CalcItemScoreReducer extends ReducerBase {

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            // Primitive counters: no boxing on every increment.
            // Index is behavior_type - 1, so a value outside 1..4 raises
            // ArrayIndexOutOfBoundsException.
            long[] counts = new long[4];
            while (values.hasNext()) {
                Record value = values.next();
                counts[(int) (value.getBigint(0) - 1)]++;
            }

            // Four behavior types: browse (0.1), favorite (0.2),
            // add to cart (0.5), purchase (1).
            double score = counts[0] * 0.1 + counts[1] * 0.2
                    + counts[2] * 0.5 + counts[3];

            Record output = context.createOutputRecord();
            output.set(0, key.get(0));
            output.set(1, key.get(1));
            output.set(2, score);
            context.write(output);
        }

    }

    /**
     * Configures the job, submits it and blocks until it completes.
     *
     * @throws OdpsException if submitting or running the job fails
     */
    public static void solve() throws OdpsException {
        JobConf job = new JobConf();

        // Map output types.
        job.setMapOutputKeySchema(SchemaUtils
                .fromString("item_id:bigint,item_category:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("behavior_type:bigint"));

        // Input and output tables.
        InputUtils.addTable(
                TableInfo.builder().tableName("train_user").build(), job);
        OutputUtils.addTable(TableInfo.builder().tableName("train_item_score")
                .build(), job);

        job.setMapperClass(CalcItemScoreMapper.class);
        job.setReducerClass(CalcItemScoreReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

package org.digdata.purchase.hobby;

import java.io.IOException;
import java.util.Iterator;

import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * MapReduce job that computes, from {@code train_user}, each user's behavior
 * score per item category and the behavior score of all users combined per
 * item category.
 *
 * <p>Two labelled outputs are produced:
 * <ul>
 * <li>{@code user_out} ({@code train_user_item_category_score}):
 * {@code (item_category, user_id, score)};</li>
 * <li>{@code item_cat_out} ({@code train_item_category_score}):
 * {@code (item_category, score)}.</li>
 * </ul>
 *
 * @author wwhhf
 * @since 2016-05-23
 */
public class UserItemCategoryScore {

    /**
     * Key-column sentinel standing for "all users".
     * NOTE(review): assumes -1 is never a real user id - confirm.
     */
    private static final long ALL_USERS = -1L;

    /**
     * Emits each row's behavior type twice: under
     * {@code (item_category, user_id)} and under
     * {@code (item_category, ALL_USERS)}.
     *
     * @author wwhhf
     * @since 2016-05-23
     */
    private static class UserItemCategoryScoreMapper extends MapperBase {

        @Override
        public void map(long key, Record record, TaskContext context)
                throws IOException {
            Long userId = record.getBigint(0);
            Long behaviorType = record.getBigint(2);
            Long itemCategory = record.getBigint(4);

            Record nextkey = context.createMapOutputKeyRecord();
            Record nextvalue = context.createMapOutputValueRecord();

            // Specific user.
            nextkey.setBigint(0, itemCategory);
            nextkey.setBigint(1, userId);
            nextvalue.setBigint(0, behaviorType);
            context.write(nextkey, nextvalue);

            // All users. The sentinel must be a Long: the column is bigint
            // and the reducer reads it back with getBigint.
            nextkey.setBigint(1, ALL_USERS);
            nextvalue.setBigint(0, behaviorType);
            context.write(nextkey, nextvalue);
        }

    }

    /**
     * Counts behavior types for one key, computes the weighted score and
     * routes it to {@code user_out} or, for the all-users sentinel, to
     * {@code item_cat_out}.
     */
    private static class UserItemCategoryScoreReducer extends ReducerBase {

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            long userId = key.getBigint(1);

            // Primitive counters: no boxing on every increment.
            // Index is behavior_type - 1, so a value outside 1..4 raises
            // ArrayIndexOutOfBoundsException.
            long[] counts = new long[4];
            while (values.hasNext()) {
                Record value = values.next();
                counts[(int) (value.getBigint(0) - 1)]++;
            }

            // Four behavior types: browse (0.1), favorite (0.2),
            // add to cart (0.5), purchase (1).
            double score = counts[0] * 0.1 + counts[1] * 0.2
                    + counts[2] * 0.5 + counts[3];

            if (userId != ALL_USERS) {
                Record output = context.createOutputRecord("user_out");
                output.set(0, key.get(0));
                output.set(1, key.get(1));
                output.set(2, score);
                context.write(output, "user_out");
            } else {
                Record output = context.createOutputRecord("item_cat_out");
                output.set(0, key.get(0));
                output.set(1, score);
                context.write(output, "item_cat_out");
            }
        }

    }

    /**
     * Configures the job, submits it and blocks until it completes.
     *
     * @throws OdpsException if submitting or running the job fails
     */
    public static void solve() throws OdpsException {
        JobConf job = new JobConf();

        // Map output types.
        job.setMapOutputKeySchema(SchemaUtils
                .fromString("item_cat:bigint,user_id:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("behavior_type:bigint"));

        // Input table and the two labelled output tables.
        InputUtils.addTable(
                TableInfo.builder().tableName("train_user").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("train_item_category_score")
                        .label("item_cat_out").build(), job);
        OutputUtils.addTable(
                TableInfo.builder().tableName("train_user_item_category_score")
                        .label("user_out").build(), job);

        job.setMapperClass(UserItemCategoryScoreMapper.class);
        job.setReducerClass(UserItemCategoryScoreReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

package org.digdata.purchase.hobby;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.digdata.model.TrainUserCategory;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.TableInfo;
import com.aliyun.odps.mapred.JobClient;
import com.aliyun.odps.mapred.MapperBase;
import com.aliyun.odps.mapred.ReducerBase;
import com.aliyun.odps.mapred.RunningJob;
import com.aliyun.odps.mapred.conf.JobConf;
import com.aliyun.odps.mapred.utils.InputUtils;
import com.aliyun.odps.mapred.utils.OutputUtils;
import com.aliyun.odps.mapred.utils.SchemaUtils;

/**
 * TF-IDF-style job: a user's behavior score for an item category divided by
 * the behavior score of all users for that category. The author's stated
 * purpose is to tell a long-term hobby from a short-term need.
 *
 * <p>Inputs are {@code train_item_category_score} (per-category totals) and
 * {@code train_user_item_category_score} (per-user scores); the output
 * {@code train_user_item_category_score_out} receives
 * {@code (item_category, user_id, ratio)}.
 *
 * @author wwhhf
 * @since 2016-05-23
 */
public class Hobby {

    /**
     * Tags each input row by its source table and keys it by item category
     * (column 0 of both inputs).
     */
    private static class HobbyMapper extends MapperBase {

        @Override
        public void map(long key, Record record, TaskContext context)
                throws IOException {
            String tablename = context.getInputTableInfo().getTableName();

            Record nextkey = context.createMapOutputKeyRecord();
            nextkey.setBigint(0, record.getBigint(0));

            Record nextvalue = context.createMapOutputValueRecord();
            if ("train_item_category_score".equals(tablename)) {
                // Category total: no user. The placeholder id must be a
                // Long because the column is bigint.
                nextvalue.setBoolean(0, false);
                nextvalue.setBigint(1, -1L);
                nextvalue.setDouble(2, record.getDouble(1));
            } else {
                // Per-user score row.
                nextvalue.setBoolean(0, true);
                nextvalue.setBigint(1, record.getBigint(1));
                nextvalue.setDouble(2, record.getDouble(2));
            }
            context.write(nextkey, nextvalue);
        }
    }

    /**
     * For one item category, divides every user's score by the category
     * total and emits one row per user.
     */
    private static class HobbyReducer extends ReducerBase {

        @Override
        public void reduce(Record key, Iterator<Record> values,
                TaskContext context) throws IOException {
            List<TrainUserCategory> userScores = new ArrayList<>();
            Double categoryTotal = null;
            while (values.hasNext()) {
                Record value = values.next();
                Boolean isUser = value.getBoolean(0);
                Long user_id = value.getBigint(1);
                Double score = value.getDouble(2);
                if (isUser) {
                    userScores.add(new TrainUserCategory(null, user_id, score));
                } else {
                    categoryTotal = score;
                }
            }

            // The total row can be absent when the two input tables do not
            // cover the same categories (e.g. truncated inputs in a local
            // run). Previously this caused a NullPointerException. A zero
            // total would yield Infinity/NaN, so that is skipped as well.
            if (categoryTotal == null || categoryTotal == 0.0) {
                return;
            }
            double total = categoryTotal;

            for (TrainUserCategory userCategory : userScores) {
                Record output = context.createOutputRecord();
                output.set(new Object[] { key.get(0),
                        userCategory.getUser_id(),
                        userCategory.getScore() / total });
                context.write(output);
            }
        }

    }

    /**
     * Configures the job, submits it and blocks until it completes.
     *
     * @throws OdpsException if submitting or running the job fails
     */
    public static void solve() throws OdpsException {
        JobConf job = new JobConf();

        // Map output types.
        job.setMapOutputKeySchema(SchemaUtils
                .fromString("item_category:bigint"));
        job.setMapOutputValueSchema(SchemaUtils
                .fromString("isUser:boolean,user_id:bigint,score:double"));

        // Two input tables and one output table.
        InputUtils.addTable(
                TableInfo.builder().tableName("train_item_category_score")
                        .build(), job);
        InputUtils.addTable(
                TableInfo.builder().tableName("train_user_item_category_score")
                        .build(), job);
        OutputUtils.addTable(
                TableInfo.builder()
                        .tableName("train_user_item_category_score_out")
                        .build(), job);

        job.setMapperClass(HobbyMapper.class);
        job.setReducerClass(HobbyReducer.class);

        RunningJob rj = JobClient.runJob(job);
        rj.waitForCompletion();
    }

}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。